Built with R version 4.2.2.

Setup

Libraries

# library(car)
library(cowplot)
library(data.table)
library(DESeq2)
library(DHARMa)
library(ggpubr)
library(grid)
library(gridExtra)
library(iNEXT)
library(knitr)
library(lmPerm)
library(MASS)
library(pscl)
# library(rcompanion)
library(seqinr)
library(tidyverse)
library(vegan)
library(viridis)

# library(devtools)
# install_github("eastmallingresearch/Metabarcoding_pipeline/scripts")
library(metafuncs)

Functions and constants

ALPHA =      0.1   # DESeq2 alpha value
OTUFILTER =  0.01  # Remove OTUs with proportion of total reads below value
READFILTER = 0.05  # Will remove samples with read sum below sample_median_reads*READFILTER 
PAIREDONLY = F     # Will remove the pair of samples which fail the readfilter - probably only useful for DESeq separated by type NOTE removes pairs before DESeq object is created   
TAXCONF =    0.80  # Sets the taxonomy confidence level to get "rank" in taxonomy files
TOPOTU =     10    # Number of Top OTUs for summary information
DIFFOTU =    200    # Number of Top OTUs for correlation analysis

# graphics
DEVICE =     "png"
DPI =        1200
WIDTH =      9
HEIGHT =     9

# Model design
FACTORS = c("Site", "Storage", "Scion")
DESIGN = y ~ Site + Storage + Scion
FULL_DESIGN = y ~ Site * Storage * Scion
canker_design = "Cankers ~ Site * Storage * Scion"
# colour blind palette
cbPalette <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73", 
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

source("functions/metabarcoding.R")
source("functions/loadme.R")
source("functions/rarefaction.R")

Load data

Bacterial and fungal ASV (ZOTU) tables, sample metadata, and taxonomy files are loaded into named lists using the loadData function from Greg’s metafuncs package.

Site names are encoded as follows according to previous work:

metadata <- "sample_metadata.txt"

# Load data
ubiome_FUN <- loadData("data/FUN.zotu_table.txt",metadata,"data/zFUN.sintax.taxa",RHB="FUN")
ubiome_BAC <- loadData("data/BAC.zotu_table.txt",metadata,"data/zBAC.sintax.taxa",RHB="BAC")

# Change sites Avalon -> 1, Scripps -> 2, and WWF -> 3.
# Storage from planting date.
# No storage for December plantings, yes for March and April (4 months).
mutate_factors <- function(data){
  data <- data %>%
    rename(location = site, Scion = cultivar) %>%
    mutate(
      Site = case_when(
        location == "Avalon" ~ 1,
        location == "Scripps" ~ 2,
        location == "WWF" ~ 3
      ) %>% as.factor(),
      Storage = case_when(
        planting_date %in% c("march", "april") ~ "yes",
        planting_date %in% c("dec") ~ "no"
      ) %>% as.factor(),
      Scion = as.factor(Scion)
    )
  return(data)
}

ubiome_FUN$colData <- mutate_factors(ubiome_FUN$colData)
# Error in rename(., location = site, Scion = cultivar): unused arguments (location = site, Scion = cultivar)
ubiome_BAC$colData <- mutate_factors(ubiome_BAC$colData)
# Error in rename(., location = site, Scion = cultivar): unused arguments (location = site, Scion = cultivar)
# In taxData and countData replace 'OTU' with 'ASV'
rownames(ubiome_FUN$taxData) <- gsub("OTU", "ASV", rownames(ubiome_FUN$taxData))
rownames(ubiome_BAC$taxData) <- gsub("OTU", "ASV", rownames(ubiome_BAC$taxData))

rownames(ubiome_FUN$countData) <- gsub("OTU", "ASV", rownames(ubiome_FUN$countData))
rownames(ubiome_BAC$countData) <- gsub("OTU", "ASV", rownames(ubiome_BAC$countData))

Global removals

# Sample "A2-7" removed due to missampling.
ubiome_BAC$colData <- ubiome_BAC$colData[!rownames(ubiome_BAC$colData) %in% "HMA27", ]
ubiome_BAC$countData <- ubiome_BAC$countData[, !colnames(ubiome_BAC$countData) %in% "HMA27"]
ubiome_FUN$colData <- ubiome_FUN$colData[!rownames(ubiome_FUN$colData) %in% "HMA27", ]
ubiome_FUN$countData <- ubiome_FUN$countData[, !colnames(ubiome_FUN$countData) %in% "HMA27"]

Filter samples and ASVs

Filtering taxa

Plantae taxa are filtered from fungal taxData. Chloroplast and Eukaryote taxa are filtered from bacterial taxData. Corresponding ASVs are removed from countData.

# Filter Plant, Chloroplast, and Eukaryote ASVs

# Fungi: Plantae ASVs
cat("Fungi:", length(grep("Plantae", ubiome_FUN$taxData$kingdom)), "Plantae ASVs\n")
# Fungi: 0 Plantae ASVs
# Bacteria: Chloroplast (Streptophyta) and Eukaryote ASVs
cat(
  "Bacteria:", length(grep("Streptophyta", ubiome_BAC$taxData$genus)), "Chloroplast ASVs;", 
  length(grep("Eukaryota", ubiome_BAC$taxData$kingdom)), "Eukaryote ASVs\n"
)
# Bacteria: 37 Chloroplast ASVs; 188 Eukaryote ASVs
# Filter Chloroplast and Eukaryote
filt <- rownames(
  ubiome_BAC$taxData[
    grepl("Streptophyta", ubiome_BAC$taxData$genus) & 
    as.numeric(ubiome_BAC$taxData$g_conf) >= TAXCONF,
  ]
)

filt <- c(filt, rownames(ubiome_BAC$taxData[grep("Eukaryota", ubiome_BAC$taxData$kingdom), ]))

cat("Bacteria: removing", length(filt), "ASVs")
# Bacteria: removing 198 ASVs
ubiome_BAC$taxData <- ubiome_BAC$taxData[!rownames(ubiome_BAC$taxData) %in% filt, ]
ubiome_BAC$countData <- ubiome_BAC$countData[!rownames(ubiome_BAC$countData) %in% filt, ]

Filtering samples

Plot rarefaction curves.

Remove samples with read count below 5 % of median.

invisible(mapply(assign, names(ubiome_BAC), ubiome_BAC, MoreArgs = list(envir = globalenv())))
rare_bac <- gfunc(countData, colData, "Bacteria")
# rare_bac <- gfunc(as.data.frame(counts(dds)), as.data.frame(colData(dds)), "Bacteria ZOTU")
invisible(mapply(assign, names(ubiome_FUN), ubiome_FUN, MoreArgs = list(envir = globalenv())))
rare_fun <- gfunc(countData, colData, "Fungi")
# rare_fun <- gfunc(as.data.frame(counts(dds)), as.data.frame(colData(dds)), "Fungi ZOTU")

rarefaction_plots <- grid.arrange(
  rare_bac, rare_fun,
  left = textGrob(label = expression("log"[10] * " aligned sequences"), rot = 90),
  bottom = "ASV count", nrow = 2
)

ggsave(filename = "rarefaction_plots.png", plot = rarefaction_plots, path = "figures/")

rarefaction_plots

# Fungi
med <- median(colSums(ubiome_FUN$countData))
filt <- !colSums(ubiome_FUN$countData) > med * READFILTER
cat("Fungi: ",sum(filt),"sample(s) removed\n")

# Bacteria
med <- median(colSums(ubiome_BAC$countData))
filt <- !colSums(ubiome_BAC$countData) > med * READFILTER
cat("Bacteria: ",sum(filt),"sample(s) removed\n")

Filter ASVs

ASV read count

Number of ASVs which account for 50 %, 80 %, and 99 % of total reads.

asv_propotions <- function(countData, proportion){
  i <- sum(countData)
  y <- rowSums(countData)
  y <- y[order(y, decreasing = T)]
  asvs <- length(y[(cumsum(y) / i <= proportion)])
  return(asvs)
}

proportions <- c(0.5, 0.9, 0.99, 1)

top_asvs <- data.table(
  "proportion" = proportions,
  "Fungi" = lapply(proportions, function(x) asv_propotions(ubiome_FUN$countData, x)),
  "Bacteria" = lapply(proportions, function(x) asv_propotions(ubiome_BAC$countData, x))
)

top_asvs
#    proportion Fungi Bacteria
# 1:       0.50    10      169
# 2:       0.90   171     2186
# 3:       0.99   995     5883
# 4:       1.00  2401     7265

Filter ASVs

Remove ASVs with read count below 1 % of total reads.

# Fungi
keep <- filter_otus(ubiome_FUN$countData, OTUFILTER)
cat("Fungi: removing", nrow(ubiome_FUN$countData) - length(keep), "ASVs\n")
# Fungi: removing 1406 ASVs
ubiome_FUN$taxData <- ubiome_FUN$taxData[rownames(ubiome_FUN$taxData) %in% keep,]
ubiome_FUN$countData <- ubiome_FUN$countData[rownames(ubiome_FUN$countData) %in% keep,]

# Bacteria
keep <-  filter_otus(ubiome_BAC$countData, OTUFILTER)
cat("Bacteria: removing", nrow(ubiome_BAC$countData) - length(keep), "ASVs")
# Bacteria: removing 1382 ASVs
ubiome_BAC$taxData <- ubiome_BAC$taxData[rownames(ubiome_BAC$taxData) %in% keep,]
ubiome_BAC$countData <- ubiome_BAC$countData[rownames(ubiome_BAC$countData) %in% keep,]

Absolute abundance normalisation

ASV normalisation is performed using qPCR theoretical copy number data. Copy number is calculated per mg of root sample from the qPCR data.

Prepare qPCR abundance data

abundance <- fread("mean_abundance.csv")

# Add sample ID to abundance data
abundance$id <- paste0("HM", gsub("-", "", abundance$Sample))
# abundance$id <- abundance$Sample
abundance$copy_number <- abundance$MeanAdjustedTCN_mg
abundance$log_copy_number <- log10(abundance$copy_number)

# Add bacterial (16S) and fungal (ITS) abundance to ubiome BAC and FUN named lists
ubiome_FUN$abundance <- abundance[abundance$Target == "ITS"] %>%
  column_to_rownames(var = "id")
ubiome_BAC$abundance <- abundance[abundance$Target == "16S"] %>%
  column_to_rownames(var = "id")

# Merge copy number from abundance with colData
ubiome_FUN$colData <- merge(
  ubiome_FUN$colData, 
  ubiome_FUN$abundance[, c("Target", "copy_number", "log_copy_number")], 
  by = 0
) %>% column_to_rownames(var = "Row.names")
ubiome_BAC$colData <- merge(
  ubiome_BAC$colData, 
  ubiome_BAC$abundance[, c("Target", "copy_number", "log_copy_number")], 
  by = 0
) %>% column_to_rownames(var = "Row.names")

Remove outliers

# Detect outliers with std > threshold from the median
detect_outliers <- function(x, val, threshold, na.rm = TRUE) {
  med_x <- median(x[[val]], na.rm = na.rm)
  sd_x <- sd(x[[val]], na.rm = na.rm)
  outliers <- x[x[[val]] > (med_x + threshold * sd_x) | x[[val]] < (med_x - threshold * sd_x), ]
  return(outliers)
}

outliers_FUN <- detect_outliers(ubiome_FUN$abundance, "MeanAdjustedTCN_mg", 3)
outliers_BAC <- detect_outliers(ubiome_BAC$abundance, "MeanAdjustedTCN_mg", 3)

# Remove samples with copy number > 3 std from the median
outliers <- rownames(outliers_FUN)
ubiome_FUN$abundance <- ubiome_FUN$abundance[!rownames(ubiome_FUN$abundance) %in% outliers, ]
ubiome_FUN$countData <- ubiome_FUN$countData[, !colnames(ubiome_FUN$countData) %in% outliers]
ubiome_FUN$colData <- ubiome_FUN$colData[!rownames(ubiome_FUN$colData) %in% outliers, ]

cat("Fungi: removing", length(outliers), "outlier(s)\n")
# Fungi: removing 1 outlier(s)

Sample A1-3 is removed from the fungal data due to abnormally high copy number.

Canker count data

Canker count data for sampled trees only.

# Canker count data for sampled trees only

canker_data <- fread("canker_data.csv", select = c(1:5, 7:34))

# Remove spaces from column names and convert to lowercase
colnames(canker_data) <- tolower(gsub(" ", "_", colnames(canker_data)))

# Codify site names, add storage and total canker count for timepoint 4
canker_data <- mutate(
  canker_data,
  Site = case_when(
    site == "Avalon" ~ 1,
    site == "Scripps" ~ 2,
    site == "WWF" ~ 3
  ) %>% as.factor(),
  Storage = case_when(
    planting_date %in% c("March", "April") ~ "yes",
    planting_date %in% c("Dec") ~ "no"
  ),
  Scion = as.factor(cultivar),
  total_cankers = a4 + b4 + c4 + d4 + e4
)

# Identify samples with missing values
missing <- unique(canker_data[!complete.cases(canker_data), code])

# Also remove sample A2-7 due to missampling
missing <- c(missing, "HMA27")

# Remove missing samples from canker data
canker_data <- canker_data[!canker_data$code %in% missing, ]

# Verify that there are two trees for each sample
canker_data %>% group_by(code) %>% summarise(n = n()) %>% filter(n != 2)
# Error in `n()`:
# ! Must only be used inside data-masking verbs like `mutate()`, `filter()`, and `group_by()`.
# Sum of total cankers for each pair of trees with matching code
cankers <- canker_data %>% 
  group_by(code) %>% 
  summarise(
    Site = first(Site),
    Storage = first(Storage),
    Scion = first(Scion),
    Cankers = sum(total_cankers)
  ) %>% 
  column_to_rownames("code")
# Error:
# ! Can't find column `code` in `.data`.
# Add total canker count to colData for both FUN and BAC
ubiome_FUN$colData <- merge(
  ubiome_FUN$colData, 
  cankers["Cankers"], 
  by = 0,
  all.x = T
) %>% column_to_rownames("Row.names")

ubiome_BAC$colData <- merge(
  ubiome_BAC$colData, 
  cankers["Cankers"], 
  by = 0,
  all.x = T
) %>% column_to_rownames("Row.names")

Summary stats

# png("figures/hist.png", width = 800, height = 600)
# hist(cankers$Cankers, breaks = 20, main = "Total canker count", xlab = "Total canker count")
# dev.off()

cankers_hist <- ggdensity(
  cankers, x = "Cankers", fill = "Site", facet.by = "Site", ncol = 1,
  add = "mean", rug = T, palette = cbPalette,
  title = "Total canker count", xlab = "Total canker count"
)

cankers_hist

ggsave(filename = "cankers_hist.png", plot = cankers_hist, path = "figures/")

cankers_box <- ggboxplot(
  cankers, x = "Site", y = "Cankers", palette = cbPalette,
  color = "Scion", add = "jitter", legend = "top", 
  title = "Total canker count", xlab = "Site", ylab = "Total canker count"
)

cankers_box

ggsave(filename = "cankers_box.png", plot = cankers_box, path = "figures/")

cankers_bar <- ggbarplot(
  cankers, x = "Site", y = "Cankers", fill = "Scion", 
  palette = cbPalette, add = "mean_se", position = position_dodge(0.8),
  title = "Total canker count", xlab = "Site", ylab = "Total canker count"
)

cankers_bar

ggsave(filename = "cankers_bar.png", plot = cankers_bar, path = "figures/")

GLM

# Effect of Site, Scion, and Storage on canker count

# Formula
formula <- update(FULL_DESIGN, Cankers ~ .)
# formula <- Cankers ~ Site + Storage + Scion + site:Storage + site:Scion + Storage:Scion

# Log-linear model
canker_lm <- lm(update(FULL_DESIGN, log(Cankers + 1) ~ .), data = cankers)

par(mfrow = c(2, 2))
plot(canker_lm)

# Residual checking
res <- resid(canker_lm, type = "pearson")

# Poisson model
canker_poisson <- glm(formula, data = cankers, family = "poisson")

poisson_plot <- plot(simulateResiduals(canker_poisson), title = "Poisson model")

# Model overdispersed

# Negative binomial model
canker_negbin <- glm.nb(formula, data = cankers)

sim <- simulateResiduals(canker_negbin)

plot(sim, title = "Negative binomial model")

# canker_model_plots <- ggarrange(lm_plot, poisson_plot, negbin_plot, ncol = 3)

# ggsave(filename = "canker_model_plots.png", plot = canker_model_plots, path = "figures/")

# png("figures/canker_residuals.png", width = 800, height = 600)
# plot(sim)
# dev.off()

testZeroInflation(sim)
# 
#   DHARMa zero-inflation test via comparison to expected zeros with
#   simulation under H0 = fitted model
# 
# data:  simulationOutput
# ratioObsSim = 0.77399, p-value = 0.76
# alternative hypothesis: two.sided
nagelkerke(canker_negbin)
# Error in nagelkerke(canker_negbin): could not find function "nagelkerke"
# Model good fit

canker_anova <- anova(canker_negbin, test = "Chisq") %>% data.frame()
total_deviance <- sum(canker_anova$Deviance, na.rm = T) + tail(canker_anova$Resid..Dev, 1)
canker_anova$Perc..Dev <- canker_anova$Deviance / total_deviance * 100

canker_anova
#                    Df     Deviance Resid..Df Resid..Dev     Pr..Chi.
# NULL               NA           NA        73  314.98196           NA
# Site                2 118.75744595        71  196.22452 1.629852e-26
# Storage             1   0.02066259        70  196.20386 8.857019e-01
# Scion               6  24.29375368        64  171.91010 4.611042e-04
# Site:Storage        2  28.86115598        62  143.04895 5.406044e-07
# Site:Scion         12  31.65405909        50  111.39489 1.564383e-03
# Storage:Scion       6   7.40666778        44  103.98822 2.848693e-01
# Site:Storage:Scion 11  26.62949240        33   77.35873 5.225415e-03
#                       Perc..Dev
# NULL                         NA
# Site               37.702935303
# Storage             0.006559927
# Scion               7.712744377
# Site:Storage        9.162796386
# Site:Scion         10.049483066
# Storage:Scion       2.351457744
# Site:Storage:Scion  8.454291190

Create DESeq objects

# Make sure countData and colData still match, if they do, create DESeq objects, if not throw error
if(identical(colnames(ubiome_FUN$countData), rownames(ubiome_FUN$colData))) {
  # Create DESeq object
  ubiome_FUN$dds <- ubiom_to_des(ubiome_FUN)
  print("FUN DESeq object created")
} else {
  stop("FUN countData and colData do not match")
}
# [1] "FUN DESeq object created"
if(identical(colnames(ubiome_BAC$countData), rownames(ubiome_BAC$colData))) {
  # Create DESeq object
  ubiome_BAC$dds <- ubiom_to_des(ubiome_BAC)
  print("BAC DESeq object created")
} else {
  stop("BAC countData and colData do not match")
}
# [1] "BAC DESeq object created"

Abundance normalisation

Absolute abundance normalisation using DESeq2 size factors.

Values are centred around the mean of the copy number.

# Normalise count data using DESeq2 size factors

ubiome_FUN$dds$sizeFactor <- ubiome_FUN$dds$copy_number / mean(ubiome_FUN$dds$copy_number)
ubiome_BAC$dds$sizeFactor <- ubiome_BAC$dds$copy_number / mean(ubiome_BAC$dds$copy_number)
# Save environment
save.image("data_loaded.RData")

Fungi

# Unpack fungi data
invisible(mapply(assign, names(ubiome_FUN), ubiome_FUN, MoreArgs = list(envir = globalenv())))

ASV and sample summary

Read and sample summary

cat(
  "Raw reads", "\n\n",
  "Total raw reads:\t\t", sum(countData), "\n",
  "Mean raw reads per sample:\t", mean(colSums(countData)), "\n",
  "Median raw reads per sample:\t", median(colSums(countData)), "\n",
  "Max raw reads per sample:\t", max(colSums(countData)), "\n",
  "Min raw reads per sample:\t", min(colSums(countData)), "\n\n"
)
# Raw reads 
# 
#  Total raw reads:      7293776 
#  Mean raw reads per sample:    90046.62 
#  Median raw reads per sample:  93435 
#  Max raw reads per sample:     113518 
#  Min raw reads per sample:     38472
#colSums(countData)

nct <- counts(dds, normalize = T)
cat("Normalised reads", "\n\n",
  "Total normalised reads:\t\t", sum(nct), "\n",
  "Mean normalised reads per sample:\t", mean(colSums(nct)), "\n",
  "Median normalised reads per sample:\t", median(colSums(nct)), "\n",
  "Min normalised reads per sample:\t", min(colSums(nct)), "\n",
  "Max normalised reads per sample:\t", max(colSums(nct)), "\n\n"
)
# Normalised reads 
# 
#  Total normalised reads:       12468857 
#  Mean normalised reads per sample:     153936.5 
#  Median normalised reads per sample:   98624.28 
#  Min normalised reads per sample:  28901.7 
#  Max normalised reads per sample:  881441.3
#round(colSums(counts(dds,normalize = T)),0)

ASV summary

cat(
  "Total ASVs:\t\t", nrow(taxData),"\n\n",
  "Raw reads per ASV summary", "\n\n",
  "Mean raw reads per ASV:\t", mean(rowSums(countData)),"\n",
  "Median raw per ASV:\t\t", median(rowSums(countData)),"\n",
  "ASV raw Min reads:\t\t", min(rowSums(countData)),"\n",
  "ASV raw Max reads:\t\t", max(rowSums(countData)),"\n\n"
)
# Total ASVs:        995 
# 
#  Raw reads per ASV summary 
# 
#  Mean raw reads per ASV:   7330.428 
#  Median raw per ASV:       588 
#  ASV raw Min reads:        115 
#  ASV raw Max reads:        714327
cat(
  "Normalised reads per ASV summary","\n\n",
  "Mean normalised reads per ASV:\t\t", mean(rowSums(nct)),"\n",
  "Median normalised reads per ASV:\t", median(rowSums(nct)),"\n",
  "ASV normalised Min reads:\t\t", min(rowSums(nct)),"\n",
  "ASV normalised Max reads:\t\t", max(rowSums(nct)),"\n\n"
)
# Normalised reads per ASV summary 
# 
#  Mean normalised reads per ASV:        12531.51 
#  Median normalised reads per ASV:  1025.725 
#  ASV normalised Min reads:         101.2814 
#  ASV normalised Max reads:         1509459
y <- rowSums(nct)
y <- y[order(y, decreasing = T)]
# proportion
xy <- y / sum(y)

cat("Top " ,TOPOTU, "ASVs:\n")
# Top  10 ASVs:
data.frame(counts = y[1:TOPOTU], proportion = xy[1:TOPOTU], rank = taxData[names(y)[1:TOPOTU],]$rank)
#          counts proportion                          rank
# ASV2  1509458.8 0.12105832                 Ascomycota(p)
# ASV1  1490469.5 0.11953538 Dactylonectria macrodidyma(s)
# ASV5  1068164.1 0.08566657              Leotiomycetes(c)
# ASV4  1059908.0 0.08500442                 Ascomycota(p)
# ASV3   480660.1 0.03854885    Ilyonectria destructans(s)
# ASV7   290896.6 0.02332985                   Fusarium(g)
# ASV6   227927.9 0.01827978        Ilyonectria robusta(s)
# ASV9   201690.6 0.01617555                 Ascomycota(p)
# ASV8   191083.5 0.01532486                   Fusarium(g)
# ASV11  131684.2 0.01056105      Truncatella angustata(s)

Taxonomy Summary

Taxonomy identifiable

Proportion of ASVs which can be assigned (with the given confidence) at each taxonomic rank.

# Proportion of ASVs which can be assigned (with the given confidence) at each taxonomic rank

tx <- copy(taxData)
setDT(tx)
cols <- names(tx)[9:15]

tx[, (cols) := lapply(.SD, as.factor), .SDcols = cols]

data.table(
  rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
  "0.8" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.8) / nrow(tx))), 2),
  "0.65" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.65) / nrow(tx))), 2),
  "0.5" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.5) / nrow(tx))), 2)
)
#       rank  0.8 0.65  0.5
# 1: kingdom 1.00 1.00 1.00
# 2:  phylum 0.84 0.87 0.90
# 3:   class 0.70 0.74 0.78
# 4:   order 0.54 0.60 0.64
# 5:  family 0.42 0.45 0.49
# 6:   genus 0.38 0.42 0.47
# 7: species 0.24 0.30 0.35

% of reads which can be assigned to each taxonomic ranks

tx <-taxData[rownames(dds),]
nc <- counts(dds, normalize = T)
ac <- sum(nc)

data.table(
  rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
  "0.8" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.8),]) / ac * 100))), 2),
  "0.65" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.65),]) / ac * 100))), 2),
  "0.5" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.5),]) / ac * 100))), 2)
)
#       rank    0.8   0.65    0.5
# 1: kingdom 100.00 100.00 100.00
# 2:  phylum  84.14  96.59  96.83
# 3:   class  60.12  70.92  71.47
# 4:   order  53.49  58.87  68.76
# 5:  family  44.97  46.80  50.25
# 6:   genus  46.06  48.01  50.72
# 7: species  30.44  36.70  41.62

Taxonomy plots

Plots of proportion of normalised reads assigned to members of phylum and class.

dat <- list(as.data.frame(counts(dds, normalize = T)), taxData, as.data.frame(colData(dds)))

design <- c("Site", "Storage")

# md1 <- getSummedTaxa(dat, conf = TAXCONF, design = design, cutoff = 0.1)
md1 <- getSummedTaxa(dat, conf = TAXCONF, design = design, taxon = "phylum", cutoff = 0.1)
# Error in `[.data.frame`(obj[[3]], , design, drop = FALSE): undefined columns selected
md1[, Site := factor(Site, levels = c(1, 2, 3))]
md1[, Storage := factor(Storage, levels = c("no", "yes"))]
md1[, taxon := factor(taxon, levels = unique(taxon[order(value, decreasing = T)]))]

removals <- md1[, .(value = mean(value)), by = "taxon"][value < 0.5, taxon]
md1 <- md1[!taxon %in% removals, ]

fun_phylum_plot <- plotfun1(md1, x = "taxon", fill = "Site") +
  facet_wrap(~ Storage)

ggsave("figures/fun_phylum.png", fun_phylum_plot, width = 25, height = 15, units = "cm")

fun_phylum_plot

md2 <- getSummedTaxa(dat, conf = TAXCONF, design = design, taxon = "class", cutoff = 0.1)
# Error in `[.data.frame`(obj[[3]], , design, drop = FALSE): undefined columns selected
md2[, Site := factor(Site, levels = c(1, 2, 3))]
md2[, Storage := factor(Storage, levels = c("no", "yes"))]
md2[, taxon := factor(taxon, levels = unique(taxon[order(value, decreasing = T)]))]

removals <- md2[, .(value = mean(value)), by = "taxon"][value < 0.5, taxon]
md2 <- md2[!taxon %in% removals, ]

fun_class_plot <- plotfun1(md2, x = "taxon", fill = "Site") +
  facet_wrap(~ Storage)

ggsave("figures/fun_class.png", fun_class_plot, width = 25, height = 15, units = "cm")

fun_class_plot

Abundance

Plot copy number for each sample grouped by site, Scion, and Storage. Test the effect of site, Scion, and Storage on copy number using ANOVA.

# abundance_plot <- ggplot(
#   data = as.data.frame(colData(dds)), 
#   aes(x = site, y = log_copy_number, colour = Scion, shape = Storage)
# ) + geom_jitter() + 
#   scale_colour_manual(values = cbPalette)

fun_abundance_box <- ggboxplot(
  data = as.data.frame(colData(dds)), x = "Site", y = "log_copy_number", 
  color = "Scion", add = "jitter", legend = "top", 
  title = "Fungal abundance", xlab = "Site", ylab = "log10 copy number"
)
# Error in `purrr::pmap()`:
# ℹ In index: 1.
# ℹ With name: log_copy_number.
# Caused by error in `[[<-.data.frame`:
# ! replacement has 0 rows, data has 81
ggsave(
  filename = "fun_abundance.png", plot = fun_abundance_box, path = "figures/", 
  height = 20, width = 20, units = "cm"
)

fun_abundance_box

fun_abundance_bar <- ggbarplot(
  data = as.data.frame(colData(dds)), x = "Storage", y = "log_copy_number", 
  fill = "Site", add = "mean_se", 
  palette = cbPalette, position = position_dodge(0.8),
  title = "(a) Fungal abundance", xlab = "Storage ", ylab = "Mean copy number (log10)"
) + guides(fill = guide_legend(title = "Site"))
# Error in `purrr::pmap()`:
# ℹ In index: 1.
# ℹ With name: log_copy_number.
# Caused by error in `dplyr::pull()`:
# ! Can't extract columns that don't exist.
# ✖ Column `Storage` doesn't exist.
ggsave(
  filename = "fun_abundance_bar.png", plot = fun_abundance_bar, path = "figures/", 
  height = 20, width = 20, units = "cm"
)

fun_abundance_bar

# Formula for ANOVA
formula <- update(FULL_DESIGN, log_copy_number ~ .)

abundance_anova <- aov(formula, data = as.data.frame(colData(dds)))
# Error in eval(predvars, data, env): object 'Site' not found
# Normality check
par(mfrow = c(2, 2))
plot(abundance_anova)

png("figures/fun_abundance_norm.png", width = 800, height = 600)
par(mfrow = c(2, 2))
plot(abundance_anova)
dev.off()
# png 
#   2
# Results
summary(abundance_anova)
#                    Df Sum Sq Mean Sq F value   Pr(>F)    
# Site                2 1.9657  0.9828  25.302 7.91e-08 ***
# Storage             1 0.0798  0.0798   2.056   0.1594    
# Scion               6 0.5131  0.0855   2.202   0.0628 .  
# Site:Storage        2 0.0768  0.0384   0.989   0.3809    
# Site:Scion         12 0.2677  0.0223   0.574   0.8494    
# Storage:Scion       6 0.1640  0.0273   0.704   0.6484    
# Site:Storage:Scion 12 0.1889  0.0157   0.405   0.9530    
# Residuals          40 1.5538  0.0388                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
abundance_results <- abundance_anova %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(abundance_results$Sum.Sq)
abundance_results$Perc.Var <- abundance_results$Sum.Sq / total_variance * 100

abundance_results
#                    Df     Sum.Sq    Mean.Sq    F.value       Pr..F.  Perc.Var
# Site                2 1.96566003 0.98283002 25.3015018 7.913123e-08 40.867065
# Storage             1 0.07984549 0.07984549  2.0555037 1.594283e-01  1.660028
# Scion               6 0.51312816 0.08552136  2.2016206 6.280900e-02 10.668194
# Site:Storage        2 0.07683263 0.03841631  0.9889711 3.808702e-01  1.597389
# Site:Scion         12 0.26772071 0.02231006  0.5743394 8.494197e-01  5.566049
# Storage:Scion       6 0.16398521 0.02733087  0.7035927 6.484006e-01  3.409335
# Site:Storage:Scion 12 0.18892649 0.01574387  0.4053027 9.530223e-01  3.927877
# Residuals          40 1.55378922 0.03884473         NA           NA 32.304063

Alpha diversity analysis

Alpha diversity plot

# plot alpha diversity - plot_alpha will convert normalised abundances to integer values

fun_alpha_plot <- plot_alpha(
  counts(dds, normalize = F), colData(dds),
  design = "Scion", colour = "Site",
  measures = c("Shannon", "Simpson"),
  type = "bar"
) + scale_colour_manual(values = cbPalette) + 
  theme(axis.title.x = element_blank()) +
  ggtitle("Fungal α-diversity")

ggsave(
  filename = "fun_alpha.png", plot = fun_alpha_plot, path = "figures/", 
  height = 20, width = 40, units = "cm"
)
# Error in `geom_point()`:
# ! Problem while computing aesthetics.
# ℹ Error occurred in the 1st layer.
# Caused by error in `FUN()`:
# ! object 'Scion' not found
fun_alpha_plot
# Error in `geom_point()`:
# ! Problem while computing aesthetics.
# ℹ Error occurred in the 1st layer.
# Caused by error in `FUN()`:
# ! object 'Scion' not found

Permutation based anova on diversity index ranks

# get the diversity index data
all_alpha_ord <- plot_alpha(
  counts(dds, normalize = F),
  colData(dds),
  returnData = T
)

# join diversity indices and metadata
all_alpha_ord <- all_alpha_ord[
  as.data.table(colData(dds), keep.rownames = "Samples"), 
  on = "Samples"
]

fun_alpha <- all_alpha_ord

formula <- FULL_DESIGN # x ~ Site * Storage * Scion + Site / Site.block

Chao1

setkey(all_alpha_ord, S.chao1)
all_alpha_ord[, measure := as.numeric(as.factor(S.chao1))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# Error in eval(predvars, data, env): object 'Site' not found
summary(result)
#        Df        SumOfSqs             R2                F          
#  Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
#  1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
#  Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
#  Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
#  3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
#  Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
#                                                   NA's   :2        
#      Pr(>F)        
#  Min.   :0.000999  
#  1st Qu.:0.029970  
#  Median :0.062937  
#  Mean   :0.282575  
#  3rd Qu.:0.481519  
#  Max.   :0.891109  
#  NA's   :2
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
# Error in `$<-.data.frame`(`*tmp*`, Perc.Var, value = numeric(0)): replacement has 0 rows, data has 7
df
#        X......Df      X...SumOfSqs         X......R2          X......F
# X   Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
# X.1 1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
# X.2 Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
# X.3 Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
# X.4 3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
# X.5 Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
# X.6         <NA>              <NA>              <NA>       NA's   :2  
#            X....Pr..F.
# X   Min.   :0.000999  
# X.1 1st Qu.:0.029970  
# X.2 Median :0.062937  
# X.3 Mean   :0.282575  
# X.4 3rd Qu.:0.481519  
# X.5 Max.   :0.891109  
# X.6        NA's   :2

Shannon

setkey(all_alpha_ord, shannon)
all_alpha_ord[, measure := as.numeric(as.factor(shannon))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# Error in eval(predvars, data, env): object 'Site' not found
summary(result)
#        Df        SumOfSqs             R2                F          
#  Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
#  1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
#  Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
#  Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
#  3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
#  Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
#                                                   NA's   :2        
#      Pr(>F)        
#  Min.   :0.000999  
#  1st Qu.:0.029970  
#  Median :0.062937  
#  Mean   :0.282575  
#  3rd Qu.:0.481519  
#  Max.   :0.891109  
#  NA's   :2
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
# Error in `$<-.data.frame`(`*tmp*`, Perc.Var, value = numeric(0)): replacement has 0 rows, data has 7
df
#        X......Df      X...SumOfSqs         X......R2          X......F
# X   Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
# X.1 1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
# X.2 Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
# X.3 Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
# X.4 3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
# X.5 Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
# X.6         <NA>              <NA>              <NA>       NA's   :2  
#            X....Pr..F.
# X   Min.   :0.000999  
# X.1 1st Qu.:0.029970  
# X.2 Median :0.062937  
# X.3 Mean   :0.282575  
# X.4 3rd Qu.:0.481519  
# X.5 Max.   :0.891109  
# X.6        NA's   :2

Simpson

setkey(all_alpha_ord, simpson)
all_alpha_ord[, measure := as.numeric(as.factor(simpson))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# Error in eval(predvars, data, env): object 'Site' not found
summary(result)
#        Df        SumOfSqs             R2                F          
#  Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
#  1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
#  Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
#  Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
#  3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
#  Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
#                                                   NA's   :2        
#      Pr(>F)        
#  Min.   :0.000999  
#  1st Qu.:0.029970  
#  Median :0.062937  
#  Mean   :0.282575  
#  3rd Qu.:0.481519  
#  Max.   :0.891109  
#  NA's   :2
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
# Error in `$<-.data.frame`(`*tmp*`, Perc.Var, value = numeric(0)): replacement has 0 rows, data has 7
df
#        X......Df      X...SumOfSqs         X......R2          X......F
# X   Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
# X.1 1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
# X.2 Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
# X.3 Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
# X.4 3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
# X.5 Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
# X.6         <NA>              <NA>              <NA>       NA's   :2  
#            X....Pr..F.
# X   Min.   :0.000999  
# X.1 1st Qu.:0.029970  
# X.2 Median :0.062937  
# X.3 Mean   :0.282575  
# X.4 3rd Qu.:0.481519  
# X.5 Max.   :0.891109  
# X.6        NA's   :2

Beta diversity PCA/NMDS

PCA

# Number of PCs to include
n_pcs <- 10

# Perform PC decomposition of DES object
mypca <- des_to_pca(dds)

# To get pca plot axis into the same scale create a dataframe of PC scores multiplied by their variance
fun_pca <- t(data.frame(t(mypca$x) * mypca$percentVar))

formula = FULL_DESIGN

Percent variation in first 10 PCs

# Cumulative percentage of variance explained
pca_cum_var <- data.frame(
  cumulative = cumsum(mypca$percentVar * 100),
  no = 1:length(mypca$percentVar)
)

# Plot cumulative percentage of variance explained
fun_cum_pca <- ggline(
  pca_cum_var, x = "no", y = "cumulative", plot_type = "l",
  xlab = "Number of PCs", ylab = "Cumulative % variance explained",
  title = "Fungi: cumulative % variance explained by PCs"
)
ggsave(filename = "fun_cum_pca.png", plot = fun_cum_pca, path = "figures/",)
fun_cum_pca

pca_var <- data.frame(
  PC = paste0("PC", 1:n_pcs),
  perc_var = round(mypca$percentVar[1:n_pcs] * 100, 1)
)

pca_var
#      PC perc_var
# 1   PC1     27.1
# 2   PC2     21.2
# 3   PC3      8.4
# 4   PC4      4.2
# 5   PC5      3.3
# 6   PC6      2.6
# 7   PC7      1.9
# 8   PC8      1.8
# 9   PC9      1.7
# 10 PC10      1.6

ANOVA of first 10 PCs

pca_summary <- apply(
  mypca$x[, 1:n_pcs], 2, 
  function(x){
    summary(aov(update(formula, x ~ .), data = as.data.frame(cbind(x, colData(dds)))))
  }
)
# Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'Site' not found
pca_summary
# $PC1
#                    Df Sum Sq Mean Sq F value Pr(>F)    
# Site                2 190646   95323 224.072 <2e-16 ***
# Storage             1     25      25   0.058 0.8106    
# Scion               6   4227     704   1.656 0.1572    
# Site:Storage        2   2566    1283   3.015 0.0603 .  
# Site:Scion         12   8401     700   1.646 0.1176    
# Storage:Scion       6   1527     255   0.598 0.7298    
# Site:Storage:Scion 12   5124     427   1.004 0.4634    
# Residuals          40  17016     425                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC2
#                    Df Sum Sq Mean Sq F value Pr(>F)    
# Site                2 137668   68834 276.644 <2e-16 ***
# Storage             1    142     142   0.572  0.454    
# Scion               6   1474     246   0.987  0.447    
# Site:Storage        2    604     302   1.213  0.308    
# Site:Scion         12   4425     369   1.482  0.172    
# Storage:Scion       6    711     119   0.476  0.822    
# Site:Storage:Scion 12   2035     170   0.681  0.759    
# Residuals          40   9953     249                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC3
#                    Df Sum Sq Mean Sq F value   Pr(>F)    
# Site                2   2673    1337   1.562 0.222149    
# Storage             1     30      30   0.035 0.851920    
# Scion               6  13665    2278   2.663 0.028714 *  
# Site:Storage        2  14748    7374   8.621 0.000771 ***
# Site:Scion         12  10740     895   1.046 0.428331    
# Storage:Scion       6   8140    1357   1.586 0.176467    
# Site:Storage:Scion 12   2870     239   0.280 0.989466    
# Residuals          40  34217     855                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC4
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2   5281  2640.3   5.834 0.00598 **
# Storage             1    635   634.6   1.402 0.24336   
# Scion               6   3161   526.9   1.164 0.34456   
# Site:Storage        2    630   315.1   0.696 0.50442   
# Site:Scion         12  13692  1141.0   2.521 0.01418 * 
# Storage:Scion       6   1827   304.5   0.673 0.67221   
# Site:Storage:Scion 12   6439   536.6   1.186 0.32589   
# Residuals          40  18104   452.6                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC5
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2      5       2   0.006 0.99447   
# Storage             1   4033    4033   9.433 0.00382 **
# Scion               6   1458     243   0.568 0.75299   
# Site:Storage        2   5967    2984   6.979 0.00251 **
# Site:Scion         12   2582     215   0.503 0.90010   
# Storage:Scion       6   1984     331   0.773 0.59554   
# Site:Storage:Scion 12   3480     290   0.678 0.76149   
# Residuals          40  17102     428                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC6
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2     32    16.2   0.055 0.9467  
# Storage             1   1498  1497.8   5.070 0.0299 *
# Scion               6   4754   792.3   2.682 0.0278 *
# Site:Storage        2    811   405.3   1.372 0.2653  
# Site:Scion         12   2040   170.0   0.575 0.8486  
# Storage:Scion       6   1724   287.4   0.973 0.4560  
# Site:Storage:Scion 12   3166   263.8   0.893 0.5611  
# Residuals          40  11817   295.4                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC7
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2     85    42.7   0.157 0.8556  
# Storage             1    448   448.2   1.645 0.2070  
# Scion               6   2285   380.9   1.398 0.2393  
# Site:Storage        2   2000   999.9   3.670 0.0344 *
# Site:Scion         12   2462   205.1   0.753 0.6925  
# Storage:Scion       6    983   163.9   0.602 0.7273  
# Site:Storage:Scion 12   3081   256.7   0.942 0.5166  
# Residuals          40  10898   272.4                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC8
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2    110    55.1   0.197  0.822
# Storage             1    549   548.5   1.963  0.169
# Scion               6    750   125.1   0.448  0.842
# Site:Storage        2    334   167.1   0.598  0.555
# Site:Scion         12   1153    96.1   0.344  0.975
# Storage:Scion       6   1856   309.4   1.108  0.375
# Site:Storage:Scion 12   3233   269.4   0.964  0.497
# Residuals          40  11175   279.4               
# 
# $PC9
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2    291   145.4   0.624 0.54101   
# Storage             1    120   120.0   0.515 0.47731   
# Scion               6    658   109.6   0.470 0.82619   
# Site:Storage        2   3474  1737.2   7.452 0.00177 **
# Site:Scion         12   1215   101.2   0.434 0.93963   
# Storage:Scion       6    606   101.0   0.433 0.85225   
# Site:Storage:Scion 12    970    80.9   0.347 0.97417   
# Residuals          40   9324   233.1                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC10
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2    131    65.4   0.396 0.67572   
# Storage             1    365   365.5   2.213 0.14466   
# Scion               6   1060   176.7   1.070 0.39635   
# Site:Storage        2   2202  1100.8   6.666 0.00317 **
# Site:Scion         12   2043   170.2   1.031 0.44086   
# Storage:Scion       6   1947   324.4   1.965 0.09381 . 
# Site:Storage:Scion 12   1980   165.0   0.999 0.46711   
# Residuals          40   6605   165.1                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Percent variation in first 10 PCs for each factor

# Extract PC scores as a list of dataframes
pcas <- lapply(pca_summary, function(i) data.frame(unclass(i)))

# Merge into single dataframe
pcs_factors_tidy <- lapply(
  names(pcas),
  function(name) {
    pcas[[name]] %>%
    mutate(
      PC = name, #substring(name, 3),
      Factor = gsub(" ", "", rownames(pcas[[name]])),
      var = Sum.Sq / sum(pcas[[name]]$Sum.Sq) * 100,
      pc_var = subset(pca_var, PC == name)$"perc_var",
      total_var = var * pc_var / 100,
      sig = case_when(
        is.na(Pr..F.) ~ "",
        Pr..F. < 0.001 ~ "***",
        Pr..F. < 0.01 ~ "**",
        Pr..F. < 0.05 ~ "*",
        TRUE ~ ""
      ),
      variance = ifelse(
        total_var < 0.01, paste0("<0.01", sig),
        paste0(round(total_var, 2), sig)
      )
    )
  }
) %>% bind_rows() %>% data.table()

# Order PCs and factors
pcs_factors_tidy$PC <- factor(pcs_factors_tidy$PC, levels = paste0("PC", 1:n_pcs))
pcs_factors_tidy$Factor <- factor(pcs_factors_tidy$Factor, levels = unique(pcs_factors_tidy$Factor))

# Significant factors
pcs_factors_tidy[
  Pr..F. < 0.05, 
  c("PC", "Factor", "Df", "F.value", "Pr..F.", "var", "pc_var", "total_var")
]
#       PC       Factor Df    F.value       Pr..F.       var pc_var  total_var
#  1:  PC1         Site  2 224.072105 1.863103e-22 83.058689   27.1 22.5089047
#  2:  PC2         Site  2 276.643717 3.766143e-24 87.680340   21.2 18.5882321
#  3:  PC3        Scion  6   2.662540 2.871400e-02 15.692236    8.4  1.3181478
#  4:  PC3 Site:Storage  2   8.620505 7.709993e-04 16.935585    8.4  1.4225892
#  5:  PC4         Site  2   5.833732 5.982146e-03 10.610344    4.2  0.4456344
#  6:  PC4   Site:Scion 12   2.521003 1.417543e-02 27.511079    4.2  1.1554653
#  7:  PC5      Storage  1   9.432657 3.823182e-03 11.015539    3.3  0.3635128
#  8:  PC5 Site:Storage  2   6.978586 2.513137e-03 16.299308    3.3  0.5378771
#  9:  PC6      Storage  1   5.069838 2.990656e-02  5.795920    2.6  0.1506939
# 10:  PC6        Scion  6   2.681842 2.779054e-02 18.395545    2.6  0.4782842
# 11:  PC7 Site:Storage  2   3.670184 3.440069e-02  8.991247    1.9  0.1708337
# 12:  PC9 Site:Storage  2   7.452304 1.774287e-03 20.856557    1.7  0.3545615
# 13: PC10 Site:Storage  2   6.666103 3.172553e-03 13.479336    1.6  0.2156694
# Table with factors as columns and PCs as rows
# pcs_factors <- dcast(pcs_factors_tidy, PC ~ Factor, value.var = "variance")
pcs_factors <- pcs_factors_tidy %>%
  select(PC, pc_var, Factor, variance) %>%
  spread(key = Factor, value = variance)

pcs_factors
#      PC pc_var     Site Storage Scion Site:Storage Site:Scion Storage:Scion
# 1   PC1   27.1 22.51***   <0.01   0.5          0.3       0.99          0.18
# 2   PC2   21.2 18.59***    0.02   0.2         0.08        0.6           0.1
# 3   PC3    8.4     0.26   <0.01 1.32*      1.42***       1.04          0.79
# 4   PC4    4.2   0.45**    0.05  0.27         0.05      1.16*          0.15
# 5   PC5    3.3    <0.01  0.36**  0.13       0.54**       0.23          0.18
# 6   PC6    2.6    <0.01   0.15* 0.48*         0.08       0.21          0.17
# 7   PC7    1.9    <0.01    0.04   0.2        0.17*       0.21          0.08
# 8   PC8    1.8     0.01    0.05  0.07         0.03       0.11          0.17
# 9   PC9    1.7     0.03    0.01  0.07       0.35**       0.12          0.06
# 10 PC10    1.6     0.01    0.04   0.1       0.22**        0.2          0.19
#    Site:Storage:Scion Residuals
# 1                 0.6      2.01
# 2                0.27      1.34
# 3                0.28       3.3
# 4                0.54      1.53
# 5                0.31      1.54
# 6                0.32      1.19
# 7                0.26      0.93
# 8                 0.3      1.05
# 9                 0.1      0.95
# 10               0.19      0.65

PCA plot

fun_pca_plot <- plotOrd(
  fun_pca,
  colData(dds),
  design = "Site",
  shapes = "Storage",
  axes = c(1, 2),
  cbPalette = T,
  alpha = 0.75,
) # + facet_wrap(~facet) 
# WARNING: Incorrect columns specified "Site"WARNING: Incorrect columns specified "Storage"
#   geom_line(aes(group=facet),alpha=0.25,linetype=3,colour="#000000") + 
#   theme(text = element_text(size=14))

ggsave(filename = "fun_pca_plot.png", plot = fun_pca_plot, path = "figures/")

fun_pca_plot

PCA sum of squares (% var)

sum_squares <- apply(mypca$x, 2 ,function(x) 
  summary(aov(update(formula, x ~ .), data = cbind(x, colData(dds))))[[1]][2]
)
# Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'Site' not found
sum_squares <- do.call(cbind, sum_squares)
x <- t(apply(sum_squares, 2, prop.table))
perVar <- x * mypca$percentVar
#colSums(perVar)
round(colSums(perVar) / sum(colSums(perVar)) * 100, 3)
# [1] 39.142  0.967  5.153  1.378 11.491  3.852  8.268 29.749

ADONIS

# Calculate Bray-Curtis distance matrix
vg <- vegdist(t(counts(dds, normalize = T)), method = "bray")

formula <- update(FULL_DESIGN, vg ~ .)

set.seed(sum(utf8ToInt("Hamish McLean")))
result <- adonis2(formula, colData(dds), permutations = 1000)
# Error in eval(predvars, data, env): object 'Site' not found
result
# Permutation test for adonis under reduced model
# Terms added sequentially (first to last)
# Permutation: free
# Number of permutations: 1000
# 
# adonis2(formula = formula, data = colData(dds), permutations = 1000)
#                    Df SumOfSqs      R2       F   Pr(>F)    
# Site                2   4.6757 0.31606 19.4091 0.000999 ***
# Storage             1   0.2240 0.01514  1.8596 0.056943 .  
# Scion               6   0.9812 0.06633  1.3577 0.062937 .  
# Site:Storage        2   0.7189 0.04859  2.9841 0.002997 ** 
# Site:Scion         12   1.4891 0.10066  1.0302 0.406593    
# Storage:Scion       6   0.6866 0.04641  0.9500 0.556444    
# Site:Storage:Scion 12   1.2000 0.08112  0.8302 0.891109    
# Residual           40   4.8180 0.32569                     
# Total              81  14.7934 1.00000                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- result %>% data.frame()
df$Perc.Var <- df$SumOfSqs / df["Total", "SumOfSqs"] * 100
df
#                    Df   SumOfSqs         R2          F      Pr..F.   Perc.Var
# Site                2  4.6756525 0.31606401 19.4090620 0.000999001  31.606401
# Storage             1  0.2239849 0.01514090  1.8595639 0.056943057   1.514090
# Scion               6  0.9812079 0.06632754  1.3576946 0.062937063   6.632754
# Site:Storage        2  0.7188806 0.04859478  2.9841392 0.002997003   4.859478
# Site:Scion         12  1.4890925 0.10065944  1.0302266 0.406593407  10.065944
# Storage:Scion       6  0.6865717 0.04641076  0.9500073 0.556443556   4.641076
# Site:Storage:Scion 12  1.1999721 0.08111552  0.8301990 0.891108891   8.111552
# Residual           40  4.8180097 0.32568705         NA          NA  32.568705
# Total              81 14.7933719 1.00000000         NA          NA 100.000000

NMDS ordination

set.seed(sum(utf8ToInt("Hamish McLean")))
ord <- metaMDS(vg,trace=0) 
#sratmax=20000,maxit=20000,try = 177, trymax = 177

fun_nmds <- scores(ord)

fun_nmds_plot <- plotOrd(
  fun_nmds, colData(dds), 
  design = "Site", 
  shape = "Storage", 
  alpha = 0.75, cbPalette = T
) #+ theme(text = element_text(size = 14))
# WARNING: Incorrect columns specified "Site"WARNING: Incorrect columns specified "Storage"
ggsave(filename = "fun_nmds_plot.png", plot = fun_nmds_plot, path = "figures/")

fun_nmds_plot

ASV abundance

Explore distribution of ASV read counts

# Extract normalised counts from DESeq object
asv_counts <- counts(dds, normalize = T) %>% as.data.frame()

# Sum ASV counts across samples
total_asv_counts <- rowSums(asv_counts)

# Sort ASVs by abundance
total_asv_counts <- total_asv_counts[order(total_asv_counts, decreasing = T)]

# Caculate cumulative percentage
cumulative <- data.frame(
  cumulative = cumsum(total_asv_counts) / sum(total_asv_counts) * 100,
  no = seq_along(total_asv_counts)
)

# Plot cumulative percentage of ASVs
fun_cum_asv <- ggline(
  data = cumulative, x = "no", y = "cumulative", 
  plot_type = "l", palette = cbPalette,
  title = "Cumulative percentage of fungal ASVs", xlab = "Number of ASVs", 
  ylab = "Cumulative percentage of reads"
)
ggsave(filename = "fun_cum_asv.png", plot = fun_cum_asv, path = "figures/")
fun_cum_asv

# Find the number of ASVs that account for 50%, 80%, and 99% of total reads
cat(
  "Number of ASVs that account for 50%, 80%, and 99% of total reads", "\n\n",
  "50%:", sum(cumulative <= 50), "\n",
  "80%:", sum(cumulative <= 80), "\n",
  "99%:", sum(cumulative <= 99), "\n"
)
# Number of ASVs that account for 50%, 80%, and 99% of total reads 
# 
#  50%: 57 
#  80%: 140 
#  99%: 741
# Find the cumulative percentage accounted for by top x ASVs
cat(
  "Percentage of total reads accounted for by the top 100, 200,and 500 ASVs:", "\n\n",
  "100:", round(cumulative[cumulative$no == 100, "cumulative"], 1) , "\n",
  "200:", round(cumulative[cumulative$no == 200, "cumulative"], 1) , "\n",
  "500:", round(cumulative[cumulative$no == 500, "cumulative"], 1) , "\n"
)
# Percentage of total reads accounted for by the top 100, 200,and 500 ASVs: 
# 
#  100: 85.9 
#  200: 92.6 
#  500: 98.1
# Average ASV counts in order
mean_asv_counts <- rowMeans(asv_counts)
mean_asv_counts <- mean_asv_counts[order(mean_asv_counts, decreasing = T)]

# Plot read count distribution
fun_asv_counts <- ggline(
  data = data.frame(ASV = seq_along(mean_asv_counts), counts = mean_asv_counts),
  x = "ASV", y = "counts", plot_type = "l",
  title = "Fungal ASV read count distribution", xlab = "ASV", ylab = "Mean read count"
)
ggsave(filename = "fun_asv_counts.png", plot = fun_asv_counts, path = "figures/")
fun_asv_counts

# Number of ASVs with mean read count > 100, 200, and 500
cat(
  "Number of ASVs with mean read count > 100, 200, and 500", "\n\n",
  "100:", sum(rowMeans(asv_counts) > 100), "\n",
  "200:", sum(rowMeans(asv_counts) > 200), "\n",
  "500:", sum(rowMeans(asv_counts) > 500), "\n"
)
# Number of ASVs with mean read count > 100, 200, and 500 
# 
#  100: 147 
#  200: 86 
#  500: 49

Filter top ASVs with mean read count > 100

# Filter the top x abundant ASVs by the sum of their normalised counts
# top_asvs <- asv_counts[order(rowSums(asv_counts), decreasing = T)[1:DIFFOTU], ]

# Filter ASVs with mean read count > 100
top_asvs <- asv_counts[rowMeans(asv_counts) > 100, ]

# Check that sample names match
identical(names(top_asvs), rownames(colData))
# [1] TRUE
# Extract taxonomic data for top ASVs
top_taxa <- taxData[rownames(top_asvs), ]

# Log transform normalised counts
# top_asvs <- log10(top_asvs + 1)

top_asv_data <- data.frame(t(top_asvs))
top_asv_ids <- rownames(top_asvs)

# Check that sample names match
identical(rownames(top_asv_data), rownames(colData))
# [1] TRUE
# Add sample metadata to top ASV data
top_asv_data <- merge(top_asv_data, colData, by = 0) %>% column_to_rownames("Row.names")

Effect of design factors on abundance of top ASVs

Effect of Site, Scion, and Storage on abundance of top 200 ASVs

# ANOVA of top ASVs
asv_lm_anova <- function(asv, formula, data) {
  f = update(formula, paste0("log(", asv, " + 1) ~ ."))
  a = aov(f, data = data) 
  a = a %>% summary() %>% unclass() %>% data.frame()
  return(a)
}

# Negative binomial regression model
asv_negbin_anova <- function(asv, formula, data) {
  f = update(formula, paste0(asv, " ~ ."))
  m = glm.nb(f, data = data)
  a = anova(m, test = "Chisq") %>% data.frame()
  return(a)
}

formula <- FULL_DESIGN

# Full design model does not converge
# formula <- y ~ site + Scion + Storage + site:Scion + site:Storage

# Extend ANOVA results with ASV metadata
extend_asv_anova <- function(anova_result, asv) {
  anova_result %>% mutate(
    ASV = asv,
    Taxonomy = taxData[asv, "rank"],
    Abundance = round(mean(top_asv_data[[asv]])),
    Factor = gsub(" ", "", rownames(anova_result)),
    perc_var = Sum.Sq / sum(anova_result$Sum.Sq) * 100,
    sig = case_when(
      is.na(Pr..F.) ~ "",
      Pr..F. < 0.001 ~ "***",
      Pr..F. < 0.01 ~ "**",
      Pr..F. < 0.05 ~ "*",
      TRUE ~ ""
    ),
    Variance = ifelse(
      perc_var < 0.01, paste0("<0.01", sig),
      paste0(round(perc_var, 2), sig)
    )
  )
}

# Perform ANOVA on list of top ASVs
top_asvs_anova_results <- lapply(
  top_asv_ids, 
  function(asv) {
    asv_lm_anova(asv, formula, top_asv_data) %>%
    extend_asv_anova(asv)
  }
) %>% bind_rows() %>% data.table()
# Error in eval(predvars, data, env): object 'Site' not found
# Group by factor and adjust p-values
top_asvs_anova_results <- top_asvs_anova_results %>% 
  group_by(Factor) %>% 
  mutate(p.adj = p.adjust(`Pr..F.`, method = "BH")) %>% 
  data.table()

# Order factors by original order
top_asvs_anova_results$Factor <- factor(top_asvs_anova_results$Factor, levels = unique(top_asvs_anova_results$Factor))

# Summary of top ASV ANOVA results
top_asvs_anova_summary <- top_asvs_anova_results %>% 
  select(ASV, Taxonomy, Abundance, Factor, Variance) %>% 
  spread(key = Factor, value = Variance) %>%
  data.table()

top_asvs_anova_summary
#         ASV               Taxonomy Abundance     Site Storage    Scion
#  1:    ASV1        Streptomyces(g) 1700.0037 28.73***    0.05     3.02
#  2:   ASV10    Acidimicrobiales(o)  348.6716  13.45**     0.8  19.01**
#  3: ASV1043        Streptomyces(g)  145.7812 27.68***    0.04     3.89
#  4:   ASV11        Actinoplanes(g)  362.5498 57.01***    0.83   8.78**
#  5:   ASV12         Pseudomonas(g)  344.1967   10.44*    3.95    13.96
#  6:  ASV124   Micromonosporales(o)  120.5557 39.92***   <0.01     5.55
#  7:   ASV13 Gammaproteobacteria(c)  401.3884 55.78***    0.65     5.14
#  8:   ASV14  Micromonosporaceae(f)  337.7555 56.68***    0.28     5.48
#  9:  ASV147      Actinobacteria(c)  114.0728 35.46***    0.16   11.93*
# 10:   ASV15      Actinocorallia(g)  345.7558 61.09***    0.04    6.95*
# 11:   ASV16      Acidimicrobiia(c)  233.2312 53.48***   <0.01     7.35
# 12:   ASV17    Mycobacteriaceae(f)  336.7446 63.95***    0.15  8.08***
# 13:   ASV18     Cellvibrionales(o)  239.5032 16.69***    0.25    11.82
# 14:   ASV19             Erwinia(g)  228.8789  39.5***    0.96     6.57
# 15:    ASV2      Kineosporiales(o) 1516.6692  19.1***    0.13   19.22*
# 16:   ASV20      Actinobacteria(c)  412.4092 78.84***     0.5     1.78
# 17: ASV2064      Actinocorallia(g)  107.8827 53.93***     0.1    8.55*
# 18:   ASV21        Streptomyces(g)  315.5804 36.15***     0.1     3.35
# 19:   ASV22           Rhizobium(g)  292.4708     5.65    0.03     8.89
# 20: ASV2209    Streptomycetales(o)  117.0966 37.67***    0.45        4
# 21:   ASV23       Mycobacterium(g)  280.6984 54.35***    0.03     7.1*
# 22:   ASV24        Arthrobacter(g)  355.3985 16.15***  4.99**   11.22*
# 23:   ASV25         Pseudomonas(g)  287.0610 18.54***   6.26*    12.54
# 24:   ASV26        Streptomyces(g)  317.6134 49.23***   <0.01     4.17
# 25:   ASV27  Micromonosporaceae(f)  175.7534 43.52***    0.38    5.69*
# 26:   ASV28       Amycolatopsis(g)  305.0520 57.08***    0.12 11.85***
# 27:   ASV29        Streptomyces(g)  287.4622 39.42***   <0.01     5.45
# 28:    ASV3     Kineosporiaceae(f) 1387.9818 31.94***    0.17     9.67
# 29:   ASV30        Streptomyces(g)  194.7991 51.85***   2.25*     5.91
# 30:   ASV31        Streptomyces(g)  262.3983 42.65***    0.34     3.04
# 31:   ASV32        Streptomyces(g)  250.3974 52.64***   <0.01     6.61
# 32:   ASV33      Bradyrhizobium(g)  214.6676 60.57***    0.04    5.94*
# 33:   ASV34 Gammaproteobacteria(c)  163.6252 17.83***     0.6   16.63*
# 34:  ASV340        Streptomyces(g)  119.0172 25.65***    0.47     2.19
# 35: ASV3482        Streptomyces(g)  112.5339 36.02***    0.01     7.18
# 36:   ASV35 Solirubrobacterales(o)  153.5592 22.55***    0.25    14.43
# 37:   ASV36       Bacillaceae_1(f)  167.0648 20.73***    0.36  14.84**
# 38:   ASV37    Sphingomonadales(o)  178.1947    8.53*    0.85    11.19
# 39:   ASV38       Mycobacterium(g)  159.5277 37.99***    0.05   11.27*
# 40:   ASV39        Streptomyces(g)  145.2649 45.47***    0.21     2.27
# 41:    ASV4        Streptomyces(g) 1216.1449 38.85***    0.26     1.97
# 42:   ASV40     Mycobacteriales(o)  176.0332 55.81***    0.51    7.82*
# 43:   ASV41        Yersiniaceae(f)  146.2637 28.91***    3.77     0.46
# 44:   ASV42 Gammaproteobacteria(c)  124.9541 14.98***    0.35   16.58*
# 45:   ASV43 Solirubrobacterales(o)  156.1030 64.17***    0.21    7.74*
# 46:   ASV44       Aquabacterium(g)  118.4572 45.69***    0.08     3.92
# 47:   ASV46      Bradyrhizobium(g)  119.4612 32.98***    0.02     7.82
# 48:   ASV47          Variovorax(g)  112.3346   10.73*    2.99     6.64
# 49:   ASV48 Gammaproteobacteria(c)  126.8827   10.48*    0.07       13
# 50:   ASV49     Novosphingobium(g)  140.2648 31.51***    0.05  13.07**
# 51:    ASV5        Streptomyces(g) 1182.6737 43.71***    0.13     3.36
# 52:   ASV50          Dokdonella(g)  110.2877  48.4***    0.69     4.14
# 53:   ASV51  Rhodanobacteraceae(f)  113.1663 18.95***    0.34    12.02
# 54:   ASV54      Comamonadaceae(f)  122.1397 65.96***    0.42     2.36
# 55:   ASV55      Comamonadaceae(f)  137.4410 63.52***    0.55     1.88
# 56:   ASV56 Gammaproteobacteria(c)  118.5193 33.11***    0.15     8.38
# 57:   ASV58 Thermomonosporaceae(f)  118.7442 53.85***    0.05    7.25*
# 58:   ASV59 Streptosporangiales(o)  103.7496 42.95***    0.28     6.77
# 59: ASV5965   Streptomycetaceae(f)  134.5568  64.4***    0.02     3.84
# 60:    ASV6        Streptomyces(g)  970.9627 49.18***    0.27     6.22
# 61:   ASV61           Kribbella(g)  100.7063 34.63***    1.56  11.88**
# 62:   ASV63        Povalibacter(g)  197.9080 30.97***     0.5   11.65*
# 63:   ASV64      Actinobacteria(c)  392.8443 24.07***    0.83     6.96
# 64:   ASV65            Nocardia(g)  100.3498 53.35***   2.61*     6.11
# 65:    ASV7      Bradyrhizobium(g)  625.7187 13.23***     0.5  18.28**
# 66:   ASV70           Kribbella(g)  102.5546 37.87***    0.21  11.18**
# 67:    ASV8        Actinoplanes(g)  604.0953  62.8***    0.82    4.12*
# 68:   ASV83      Steroidobacter(g)  127.8547     1.08    0.01 25.68***
# 69:   ASV88            Nocardia(g)  106.5701 75.76***    0.18      2.7
# 70:    ASV9          Nonomuraea(g)  409.5173     3.44    1.95     7.95
# 71:  ASV944        Streptomyces(g)  125.6294 57.47***    0.04     2.33
#         ASV               Taxonomy Abundance     Site Storage    Scion
#     Site:Storage Site:Scion Storage:Scion Site:Storage:Scion Residuals
#  1:      10.97**      14.56          5.57               7.43     29.66
#  2:         4.57      11.69          9.96               5.64     34.89
#  3:      12.12**       9.29          6.83               6.12     34.02
#  4:     10.66***       3.05           2.8               2.38     14.49
#  5:         4.07       5.23           2.8              10.47     49.07
#  6:     17.24***       3.67          4.92               4.48     24.22
#  7:       5.63**       8.69          2.04               5.66     16.41
#  8:      9.36***       5.76          4.87               1.47      16.1
#  9:      12.3***        5.4          2.55               4.03     28.18
# 10:        2.47*       6.54          3.03               5.42     14.45
# 11:         1.74       9.01           2.3               5.12     20.99
# 12:      5.82***       4.63          2.54               4.09     10.74
# 13:      11.42**       7.19          4.76              12.79     35.09
# 14:         1.24       3.64          3.36               7.11     37.62
# 15:        7.63*       6.71          4.31               3.22     39.68
# 16:         0.85       4.73          0.79               2.02     10.49
# 17:        6.5**       3.58          4.27               2.98      20.1
# 18:         1.99      14.21          3.61               8.79      31.8
# 19:         6.49      15.93          9.25               6.63     47.13
# 20:      10.97**       7.65          4.85               5.27     29.13
# 21:         1.26        7.2          4.33                6.1     19.62
# 22:     17.29***       4.02          6.86              12.88     26.58
# 23:         3.21       7.11          2.63               9.51      40.2
# 24:       6.22**       7.24          1.53              11.24     20.37
# 25:     10.92***     11.76*          2.14               9.35     16.23
# 26:        3.98*       3.31          4.08               3.21     16.36
# 27:       9.08**      11.39           6.1               3.14     25.41
# 28:         6.4*       9.21          4.18               6.42     32.02
# 29:        4.42*     11.94*          2.23               1.62     19.78
# 30:        5.72*      11.17           0.7               6.14     30.24
# 31:        4.62*       5.26          4.29               2.22     24.36
# 32:       5.89**       7.08           1.2                3.6     15.68
# 33:         5.65       5.36           6.8                7.3     39.84
# 34:     14.48***      16.75          6.19               4.63     29.63
# 35:     10.92***        8.3          6.73               5.61     25.22
# 36:         0.85       6.78          4.58               8.04     42.53
# 37:        5.63*       8.08          8.72               11.6     30.04
# 38:        9.2**      17.89          6.52              10.66     35.17
# 39:         2.49       7.43         9.07*               7.87     23.84
# 40:         5.8*       9.72          3.89               6.17     26.47
# 41:     11.57***       7.87          5.36               9.29     24.83
# 42:         1.71       5.35          5.08               5.63      18.1
# 43:         4.04       2.29          6.99               9.12     44.43
# 44:         5.01      14.82          8.53               4.16     35.57
# 45:         1.86       3.23          3.37               3.45     15.98
# 46:       8.53**        4.5          5.91               4.29     27.08
# 47:         4.61      14.35          2.27               3.13     34.83
# 48:         1.08      12.65           2.9              11.73     51.28
# 49:        8.81*       9.03          4.59              12.01     42.02
# 50:        6.61*      11.28          5.24               6.65      25.6
# 51:     13.96***      13.2*          2.66               4.53     18.44
# 52:       7.82**       8.91          2.76               1.49      25.8
# 53:         6.73       5.86          9.77               4.46     41.85
# 54:      8.12***       5.59          0.66               3.14     13.75
# 55:        3.53*       8.33          0.67               4.32     17.19
# 56:        6.79*        6.9          3.51               8.16     32.99
# 57:      7.21***       5.08          4.53               7.24      14.8
# 58:         2.44      12.23          4.13               5.91      25.3
# 59:       5.41**       6.82          1.47                2.9     15.16
# 60:       7.34**      12.29          2.35               1.74     20.59
# 61:       9.08**       7.53          7.94               3.97     23.42
# 62:        6.99*       7.06          7.72               4.18     30.94
# 63:      10.88**       8.79           4.6               3.51     40.36
# 64:        3.41*       5.71          4.92               4.74     19.16
# 65:     14.72***      11.14          8.41               2.68     31.04
# 66:     17.38***       7.14          4.75                3.1     18.38
# 67:      11.2***       4.15        6.31**               1.46      9.13
# 68:       9.95**      13.74          9.19               5.77     34.58
# 69:         0.33       5.93          1.01               2.03     12.07
# 70:     18.98***      11.16         12.14               6.34     38.04
# 71:      9.15***       7.79          1.94               4.06     17.23
#     Site:Storage Site:Scion Storage:Scion Site:Storage:Scion Residuals
cat(
  "Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values", "\n\n",
  "Site:", nrow(top_asvs_anova_results[Factor == "Site" & p.adj < 0.05, ]), "\n",
  "Storage:", nrow(top_asvs_anova_results[Factor == "Storage" & p.adj < 0.05, ]), "\n",
  "Scion:", nrow(top_asvs_anova_results[Factor == "Scion" & p.adj < 0.05, ]), "\n",
  "Site:Storage:", nrow(top_asvs_anova_results[Factor == "Site:Storage" & p.adj < 0.05, ]), "\n",
  "Site:Scion:", nrow(top_asvs_anova_results[Factor == "Site:Scion" & p.adj < 0.05, ]), "\n",
  "Storage:Scion:", nrow(top_asvs_anova_results[Factor == "Storage:Scion" & p.adj < 0.05, ]), "\n",
  "Site:Storage:Scion:", nrow(top_asvs_anova_results[Factor == "Site:Storage:Scion" & p.adj < 0.05, ]), "\n\n",
  "Total ASVs:", length(unique(top_asvs_anova_results$ASV)), "\n\n"
)
# Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values 
# 
#  Site: 65 
#  Storage: 1 
#  Scion: 14 
#  Site:Storage: 36 
#  Site:Scion: 0 
#  Storage:Scion: 1 
#  Site:Storage:Scion: 0 
# 
#  Total ASVs: 71
# Filter by significant effect of scion and its interactions
scion_asvs <- top_asvs_anova_results[grepl("Scion", Factor) & p.adj < 0.05, ]

scion_asvs
#     Df    Sum.Sq  Mean.Sq  F.value       Pr..F.   ASV               Taxonomy
#  1:  6  6.462817 1.077136 3.632478 0.0057001310 ASV10    Acidimicrobiales(o)
#  2:  6 10.164244 1.694041 4.038754 0.0029567506 ASV11        Actinoplanes(g)
#  3:  6 13.711397 2.285233 3.205277 0.0115314268 ASV15      Actinocorallia(g)
#  4:  6  9.239304 1.539884 5.014190 0.0006493282 ASV17    Mycobacteriaceae(f)
#  5:  6 15.526618 2.587770 3.228422 0.0110957163  ASV2      Kineosporiales(o)
#  6:  6 16.024984 2.670831 4.828893 0.0008602464 ASV28       Amycolatopsis(g)
#  7:  6 25.717650 4.286275 3.293104 0.0099653617 ASV36       Bacillaceae_1(f)
#  8:  6  6.662841 1.110474 3.228302 0.0110979402 ASV43 Solirubrobacterales(o)
#  9:  6  6.477019 1.079503 3.404861 0.0082828400 ASV49     Novosphingobium(g)
# 10:  6 11.385314 1.897552 3.266670 0.0104123121 ASV58 Thermomonosporaceae(f)
# 11:  6 10.646995 1.774499 3.380295 0.0086257899 ASV61           Kribbella(g)
# 12:  6  6.128484 1.021414 3.925699 0.0035442496  ASV7      Bradyrhizobium(g)
# 13:  6  8.955213 1.492536 4.056045 0.0028762031 ASV70           Kribbella(g)
# 14:  6 14.890921 2.481820 4.610217 0.0012037794  ASV8        Actinoplanes(g)
# 15:  6 14.755142 2.459190 4.950265 0.0007152424 ASV83      Steroidobacter(g)
#     Abundance        Factor  perc_var sig Variance       p.adj
#  1:  348.6716         Scion 19.008777  **  19.01** 0.027240049
#  2:  362.5498         Scion  8.779256  **   8.78** 0.015149537
#  3:  345.7558         Scion  6.948747   *    6.95* 0.049406199
#  4:  336.7446         Scion  8.075128 ***  8.08*** 0.004137386
#  5: 1516.6692         Scion 19.217052   *   19.22* 0.047962403
#  6:  305.0520         Scion 11.853513 *** 11.85*** 0.005151114
#  7:  167.0648         Scion 14.839456  **  14.84** 0.044619683
#  8:  156.1030         Scion  7.736463   *    7.74* 0.047962403
#  9:  140.2648         Scion 13.074252  **  13.07** 0.038835580
# 10:  118.7442         Scion  7.250910   *    7.25* 0.045795744
# 11:  100.7063         Scion 11.875110  **  11.88** 0.039694607
# 12:  625.7187         Scion 18.278756  **  18.28** 0.017964146
# 13:  102.5546         Scion 11.179662  **  11.18** 0.014890343
# 14:  604.0953 Storage:Scion  6.314704  **   6.31** 0.007038569
# 15:  127.8547         Scion 25.675544 *** 25.68*** 0.004443443
cat(
  length(scion_asvs$ASV), 
  "ASVs with significant (*P* < 0.05) adjusted p-values for the effect of Scion and its interactions.", "\n\n"
)
# 15 ASVs with significant (*P* < 0.05) adjusted p-values for the effect of Scion and its interactions.
# Summary of ASVs with significant Scion effect
top_asvs_anova_summary[ASV %in% scion_asvs$ASV, ]
#       ASV               Taxonomy Abundance     Site Storage    Scion
#  1: ASV10    Acidimicrobiales(o)  348.6716  13.45**     0.8  19.01**
#  2: ASV11        Actinoplanes(g)  362.5498 57.01***    0.83   8.78**
#  3: ASV15      Actinocorallia(g)  345.7558 61.09***    0.04    6.95*
#  4: ASV17    Mycobacteriaceae(f)  336.7446 63.95***    0.15  8.08***
#  5:  ASV2      Kineosporiales(o) 1516.6692  19.1***    0.13   19.22*
#  6: ASV28       Amycolatopsis(g)  305.0520 57.08***    0.12 11.85***
#  7: ASV36       Bacillaceae_1(f)  167.0648 20.73***    0.36  14.84**
#  8: ASV43 Solirubrobacterales(o)  156.1030 64.17***    0.21    7.74*
#  9: ASV49     Novosphingobium(g)  140.2648 31.51***    0.05  13.07**
# 10: ASV58 Thermomonosporaceae(f)  118.7442 53.85***    0.05    7.25*
# 11: ASV61           Kribbella(g)  100.7063 34.63***    1.56  11.88**
# 12:  ASV7      Bradyrhizobium(g)  625.7187 13.23***     0.5  18.28**
# 13: ASV70           Kribbella(g)  102.5546 37.87***    0.21  11.18**
# 14:  ASV8        Actinoplanes(g)  604.0953  62.8***    0.82    4.12*
# 15: ASV83      Steroidobacter(g)  127.8547     1.08    0.01 25.68***
#     Site:Storage Site:Scion Storage:Scion Site:Storage:Scion Residuals
#  1:         4.57      11.69          9.96               5.64     34.89
#  2:     10.66***       3.05           2.8               2.38     14.49
#  3:        2.47*       6.54          3.03               5.42     14.45
#  4:      5.82***       4.63          2.54               4.09     10.74
#  5:        7.63*       6.71          4.31               3.22     39.68
#  6:        3.98*       3.31          4.08               3.21     16.36
#  7:        5.63*       8.08          8.72               11.6     30.04
#  8:         1.86       3.23          3.37               3.45     15.98
#  9:        6.61*      11.28          5.24               6.65      25.6
# 10:      7.21***       5.08          4.53               7.24      14.8
# 11:       9.08**       7.53          7.94               3.97     23.42
# 12:     14.72***      11.14          8.41               2.68     31.04
# 13:     17.38***       7.14          4.75                3.1     18.38
# 14:      11.2***       4.15        6.31**               1.46      9.13
# 15:       9.95**      13.74          9.19               5.77     34.58
# Export significant ASVs as fasta

# Read FUN ASVs
FUN_asvs <- read.fasta("data/FUN.zotus.fa")
# Replace 'OTU' with 'ASV' in sequence names
names(FUN_asvs) <- gsub("OTU", "ASV", names(FUN_asvs))

# Write significant ASVs to fasta
write.fasta(
  sequences = FUN_asvs[scion_asvs$ASV],
  names = paste(scion_asvs$ASV, taxData[scion_asvs$ASV, "rank"]),
  file = "fasta/FUN_scion_asvs.fasta"
)

Canker counts

Testing the effects of of total abundance, ASV abundance, α-diversity, and β-diversity on canker counts.

This uses a nested negative binomial regression model.

The base model for canker counts uses the formula: Cankers ~ Site * Storage * Scion.

# Filter out samples with missing canker count
canker_abundance_data <- colData[complete.cases(colData$Cankers), ]

# Base model
canker_design <- "Cankers ~ Site * Storage * Scion"
base_model <- glm.nb(canker_design, data = canker_abundance_data)
# Error in eval(predvars, data, env): object 'Site' not found
# Abundance model
abundance_design <- paste(canker_design, "+ log(copy_number)")
abundance_model <- glm.nb(abundance_design, data = canker_abundance_data)
# Error in eval(predvars, data, env): object 'Site' not found
# ANOVA of abundance with canker count
kable(anova(base_model, abundance_model))
Model theta Resid. df 2 x log-lik. Test df LR stat. Pr(Chi)
Site * Storage * Scion 2.882191 33 -554.6118 NA NA NA
Site * Storage * Scion + log(copy_number) 2.980708 32 -551.9342 1 vs 2 1 2.677645 0.1017661

Effect of ASV abundance on canker count

# Filter out samples with missing canker count
canker_top_asv_data <- top_asv_data[complete.cases(top_asv_data$Cankers), ]
# all.equal(top_asv_data[c("Site", "Storage", "Scion", "Cankers")],cankers)

# Base model design
canker_design <- "Cankers ~ Site * Storage * Scion"

# Base model with ASV abundance data
base_model <- glm.nb(canker_design, data = canker_top_asv_data)
# Error in eval(predvars, data, env): object 'Site' not found
# Fits glm.nb model and returns a list of the model and warnings
glm.nb_with_warnings <- function(f, data) {
  m = tryCatch(
    {
      list(
        fit = glm.nb(f, data = data),
        warning = NA
      )
    },
    warning = function(w) {
      list(
        fit = glm.nb(f, data = data),
        warning = paste(conditionMessage(w), collapse = ", ")
      )
    }
  )
  return(m)
}

# ANOVA of top ASVs with canker count
asv_canker_anova <- function(asv, design, base_model, data) {
  log_asv = paste0("log(", asv, " + 1)")
  f = paste(design, "+", log_asv)#, "+", log_asv, ":Site")
  m = glm.nb_with_warnings(f, data)
  a = anova(base_model, m$fit) %>% data.frame()
  b = suppressWarnings(anova(m$fit)) %>% data.frame()
  total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
  d = data.frame(
    ASV = asv,
    Taxonomy = taxData[asv, "rank"],
    Abundance = mean(data[[asv]]),
    coef = m$fit$coefficients[log_asv],
    var = b[log_asv, 'Deviance'] / total_deviance * 100,
    p = a[2, 'Pr.Chi.'],
    warning = m$warning
  )
  return(d)
}

# kable(asv_canker_anova(top_asv_ids[1], canker_design, base_model, canker_top_asv_data))

# Effect of ASV abundance on canker count for top ASVs
asv_canker_results <- sapply(
  top_asv_ids, 
  function(x) asv_canker_anova(x, canker_design, base_model, canker_top_asv_data)
) %>% t() %>% data.table()
# Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 't': object 'Site' not found
# Adjust p-values for multiple testing
asv_canker_results$p_adjusted <- p.adjust(asv_canker_results$p, method = "BH")

# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
  nrow(asv_canker_results[p_adjusted < 0.05, ]), "of", nrow(asv_canker_results),
  "ASVs have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 1 of 71 ASVs have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(asv_canker_results[p_adjusted < 0.05, ]) > 0) {
  asv_canker_results[p_adjusted < 0.05, ]
}
#      ASV           Taxonomy Abundance       coef      var            p warning
# 1: ASV40 Mycobacteriales(o)  156.6922 -0.5353163 1.070081 0.0004620826      NA
#    p_adjusted
# 1: 0.03280786

Effect of ASV abundance on canker count per site

# For each site, select ASVs with mean abundance > 100
top_asvs_per_site <- lapply(
  unique(colData$Site),
  function(site) {
    samples <- filter(colData, Site == site)
    top_asv_data <- select(asv_counts, rownames(samples))
    top_asvs <- filter(top_asv_data, rowMeans(top_asv_data) > 100)
    top_asv_ids <- rownames(top_asvs)
    top_asvs <- data.frame(t(top_asvs)) %>% merge(samples, by = 0) %>% column_to_rownames("Row.names")
    top_asvs <- top_asvs[complete.cases(top_asvs$Cankers), ]
    return(list(asvs = top_asv_ids, data = top_asvs))
  }
)

cat(
  "Sites 1, 2, 3 contained",
  paste(sapply(top_asvs_per_site, function(x) length(x$asvs)), collapse = ", "),
  "ASVs respectively with mean read count > 100", "\n\n"
)
# Sites 1, 2, 3 contained  ASVs respectively with mean read count > 100
canker_site_design <- "Cankers ~ Storage * Scion"

# ANOVA of ASV abundance with canker count per ASV
asv_canker_site_anova <- function(asvs, data) {
  base_model <- glm.nb(canker_site_design, data = data)
  results <- sapply(
    asvs, 
    function(asv) asv_canker_anova(asv, canker_site_design, base_model, data)
  ) %>% t() %>% data.table()
  results$p_adjusted <- p.adjust(results$p, method = "BH")
  return(results)
}

# Run ANOVA per site
asv_canker_site_results <- lapply(
  top_asvs_per_site,
  function(x) asv_canker_site_anova(x$asvs, x$data)
)

# Add site to each result as new column and merge into single data.table
asv_canker_site_results <- lapply(
  1:3, 
  function(site) {
    result <- asv_canker_site_results[[site]]
    result$Site <- site
    result
  }
) %>% bind_rows()
# Error in asv_canker_site_results[[site]]: subscript out of bounds
# Significant ASVs
significant_asvs <- asv_canker_site_results[p_adjusted < 0.05 & is.na(warning), ]
# Error in eval(expr, envir, enclos): object 'p_adjusted' not found
significant_asvs
#         ASV               Taxonomy Abundance      coef      var            p
#  1:    ASV2      Kineosporiales(o)  670.9308 0.6280854 6.996585  0.001849117
#  2:   ASV27  Micromonosporaceae(f)  320.4859 0.4042703 2.946708    0.0047199
#  3:    ASV3     Kineosporiaceae(f)   780.185 0.6215193 10.36089  0.001205674
#  4:    ASV4        Streptomyces(g)  400.1423  0.945169 2.159619  2.24083e-05
#  5:    ASV7      Bradyrhizobium(g)  458.6015 0.5898238   3.7398  0.006447539
#  6:   ASV81 Steroidobacteraceae(f)  102.7877 0.4771125 3.930028  0.005419897
#  7: ASV1043        Streptomyces(g)  125.6528 -1.625193 12.86475 0.0003196356
#  8:  ASV124   Micromonosporales(o)  228.8759  -1.12016 7.142329   0.00166798
#  9:  ASV226        Streptomyces(g)  133.4371 -0.890692 11.51696  0.004341775
# 10:   ASV26        Streptomyces(g)   334.252 -1.176118 11.11247  0.006943457
# 11:  ASV497  Micromonosporaceae(f)  117.4129 -2.338883 11.94193 7.702484e-06
# 12: ASV5732        Streptomyces(g)  107.9548 -1.099433 14.36652 0.0007480738
#     warning   p_adjusted Site
#  1:      NA 0.0240385172    2
#  2:      NA 0.0419090058    2
#  3:      NA 0.0235106395    2
#  4:      NA 0.0008739235    2
#  5:      NA 0.0419090058    2
#  6:      NA 0.0419090058    2
#  7:      NA 0.0062328945    3
#  8:      NA 0.0144558263    3
#  9:      NA 0.0282215361    3
# 10:      NA 0.0386849740    3
# 11:      NA 0.0004245090    3
# 12:      NA 0.0093448412    3
# Export significant ASVs as FASTA
write.fasta(
  sequences = FUN_asvs[as.character(significant_asvs$ASV)],
  names = paste(significant_asvs$ASV, taxData[as.character(significant_asvs$ASV), "rank"]),
  file = "fasta/FUN_canker_asvs.fasta"
)
Plot of ASV abundance against canker count
# List of significant ASVs
significant_asv_list <- significant_asvs$ASV %>% unlist()

significant_asv_data <- asv_counts[significant_asv_list, ] %>% 
  t() %>% 
  data.frame() %>% 
  merge(colData, by = 0) %>% 
  column_to_rownames("Row.names") %>%
  select(c(significant_asv_list, "Site", "Storage", "Scion", "Cankers"))
# Error in `select()`:
# ! Can't subset columns that don't exist.
# ✖ Columns `ASV1043` and `ASV5732` don't exist.
# Melt data for ggplot
significant_asv_long_data <- significant_asv_data %>% reshape2::melt(
  id.vars = c("Site", "Storage", "Scion", "Cankers"), variable.name = "ASV", value.name = "Abundance"
)

# Log trasnform abundance
significant_asv_long_data$log10_abundance <- log10(significant_asv_long_data$Abundance + 1)

fun_asv_canker_plot <- ggscatter(
  data = significant_asv_long_data, x = "log10_abundance", y = "Cankers", 
  color = "Storage", facet.by = c("ASV", "Site"),
  xlab = "ASV abundance (log10)", ylab = "Canker count",
  palette = cbPalette, legend = "bottom"
)

ggsave(
  filename = "fun_asv_canker_plot.png", plot = fun_asv_canker_plot, path = "figures/",
  height = 40, width = 20, units = "cm"
)

fun_asv_canker_plot

Effect of aggregated genera on canker counts

# Add genus from taxData to countData
fun_genus_data <- counts(dds, normalize = T) %>% as.data.frame() %>% mutate(
  genus = taxData[rownames(countData), "genus"]
)

# Group by genus
fun_genus_data <- fun_genus_data %>% group_by(genus) %>% summarise_all(sum) %>% as.data.frame()

# Set rownames as genus
rownames(fun_genus_data) <- fun_genus_data$genus
fun_genus_data <- dplyr::select(fun_genus_data, -genus)

# Filter genera with mean abundance < 100
fun_genus_data <- fun_genus_data[rowMeans(fun_genus_data) > 10, ]

# Rank not genus
not_genus <- rownames(fun_genus_data)[grep("\\([a-z]\\)", rownames(fun_genus_data))]
# Remove rows with genus in not_genus
fun_genus_data <- fun_genus_data[!rownames(fun_genus_data) %in% not_genus, ]
cat(
  length(not_genus), "non-genus ranks removed:\n\n",
  not_genus, "\n"
)
# 1 non-genus ranks removed:
# 
#  Clavicipitaceae(f)
# Final genus list
fun_genera <- rownames(fun_genus_data)

# Transpose and add metadata from colData
fun_genus_data <- t(fun_genus_data) %>% as.data.frame() %>% mutate(
  Site = colData[rownames(.), "Site"],
  Storage = colData[rownames(.), "Storage"],
  Scion = colData[rownames(.), "Scion"],
  Cankers = colData[rownames(.), "Cankers"]
)

# Filter out samples with missing canker count
fun_genus_data <- fun_genus_data[complete.cases(fun_genus_data$Cankers), ]
# Base model design
canker_design = "Cankers ~ Site * Storage * Scion"

# Base model with genus abundance data
base_model <- glm.nb(canker_design, data = fun_genus_data)
# Error in eval(predvars, data, env): object 'Site' not found
# ANOVA of genus abundance with canker count
genus_canker_anova <- function(genus, design, base_model, data) {
  log_genus = paste0("log(", genus, " + 1)")
  f = paste(design, "+", log_genus)#, "+", log_genus, ":Site")
  m = glm.nb(f, data = data)
  a = anova(base_model, m) %>% data.frame()
  b = suppressWarnings(anova(m)) %>% data.frame()
  total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
  d = data.table(
    Genus = genus,
    Abundance = mean(data[[genus]]),
    coef = m$coefficients[log_genus],
    var = b[log_genus, 'Deviance'] / total_deviance * 100,
    p = a[2, 'Pr.Chi.']
  )
  return(d)
}

# genus_canker_anova(fun_genera[1], canker_design, base_model, fun_genus_data)

# Effect of genera abundance on canker counts
genus_canker_results <- sapply(
  fun_genera, 
  function(x) genus_canker_anova(x, canker_design, base_model, fun_genus_data)
) %>% t() %>% data.table()
# Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 't': object 'Site' not found
# Adjust p-values for multiple testing
genus_canker_results$p_adjusted <- p.adjust(genus_canker_results$p, method = "BH")

# Summary of genera with statistically significant (*P* < 0.05) adjusted p-values
cat(
  nrow(genus_canker_results[p_adjusted < 0.05, ]), "of", nrow(genus_canker_results),
  "genera have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 2 of 238 genera have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(genus_canker_results[p_adjusted < 0.05, ]) > 0) {
  genus_canker_results[p_adjusted < 0.05, ]
}
#               Genus Abundance       coef      var            p   p_adjusted
# 1: Marisediminicola  11.01665 -0.7908458 1.442856 1.264145e-05 1.504332e-03
# 2:    Pedomicrobium  22.65009  0.7736918 1.317756 3.761193e-07 8.951639e-05
sig_genus <- genus_canker_results[p_adjusted < 0.05]$Genus %>% unlist()

for (genus in sig_genus) {
  log_genus = paste0("log(", genus, " + 1)")
  f = paste(canker_design, "+", log_genus)
  m = glm.nb(f, data = fun_genus_data)
  print(anova(base_model, m))
}
# Error in eval(predvars, data, env): object 'Site' not found
Plot of genus abundance against canker count
significant_genera <- genus_canker_results[p_adjusted < 0.05]$Genus %>% unlist()

significant_genera_data <- fun_genus_data[, c(significant_genera, FACTORS, "Cankers")]
# Error in `[.data.frame`(fun_genus_data, , c(significant_genera, FACTORS, : undefined columns selected
# Melt data for ggplot
significant_genera_data <- significant_genera_data %>% reshape2::melt(
  id.vars = c(FACTORS, "Cankers"), variable.name = "Genus", value.name = "Abundance"
)

# Log transform abundance
significant_genera_data$log10_abundance <- log(significant_genera_data$Abundance + 1)
# Error in significant_genera_data$Abundance + 1: non-numeric argument to binary operator
# significant_genera_data$log10_cankers <- log(significant_genera_data$Cankers + 1)

# Plot of genus abundance against canker count
fun_genus_canker_plot <- ggscatter(
  data = significant_genera_data, x = "log10_abundance", y = "Cankers", 
  color = "Site", shape = "Storage",
  xlab = "Genus abundance (log10)", ylab = "Canker count",
  free_x = T, free_y = T, palette = cbPalette
) + facet_wrap(~ Genus, scales = "free")

ggsave(
  filename = "fun_genus_canker_plot.png", plot = fun_genus_canker_plot, path = "figures/",
  height = 10, width = 20, units = "cm"
)
# Error:
# ! Problem while converting geom to grob.
# ℹ Error occurred in the 1st layer.
# Caused by error in `UseMethod()`:
# ! no applicable method for 'rescale' applied to an object of class "character"

Effect of α-diversity on canker count

# ANOVA of α-diversity with canker count

# Base model with α-diversity data
base_model <- glm.nb(canker_design, data = all_alpha_ord)
# Error in eval(predvars, data, env): object 'Site' not found
measures <- c("S.chao1", "shannon", "simpson")

# ANOVA of α-diversity with canker count
alpha_canker_anova <- function(measure, data) {
  f = paste(canker_design, "+", measure)
  m = glm.nb(f, data = data)
  a = anova(base_model, m) %>% data.frame()
  b = anova(m) %>% data.frame()
  total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
  d = data.frame(
    measure = measure,
    coef = m$coefficients[measure],
    var = b[measure, 'Deviance'] / total_deviance * 100,
    p = a[2, 'Pr.Chi.']
  )
  return(d)
}

# alpha_canker_anova("shannon", all_alpha_ord)

# Effect of α-diversity on canker count for each measure
alpha_canker_results <- data.table(t(sapply(measures, function(x) alpha_canker_anova(x, all_alpha_ord))))
# Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 't': object 'Site' not found
alpha_canker_results
#    measure         coef       var          p
# 1: S.chao1 0.0003260492 0.1949192  0.3631458
# 2: shannon     1.047428  3.043469 0.03415997
# 3: simpson     74.20115  2.554056 0.01184907
# ANOVA results
for (measure in measures) {
  f = paste(canker_design, "+", measure)
  m = glm.nb(f, data = all_alpha_ord)
  print(anova(base_model, m))
}
# Error in eval(predvars, data, env): object 'Site' not found

Effect of β-diversity on canker count

no_pcs <- 4

# Merge PC scores with canker data
pc_scores <- merge(colData, data.frame(mypca$x[, 1:no_pcs]), by = "row.names") %>% 
  column_to_rownames("Row.names")

pcs <- tail(colnames(pc_scores), no_pcs)

# Base model with β-diversity data
base_model <- glm.nb(canker_design, data = pc_scores)
# Error in eval(predvars, data, env): object 'Site' not found
# ANOVA of β-diversity with canker count
beta_canker_anova <- function(pc, data) {
  f = paste0(canker_design, "+", pc)
  m = glm.nb(f, data = data)
  a = anova(base_model, m) %>% data.frame()
  b = anova(m) %>% data.frame()
  total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
  d = data.frame(
    PC = pc,
    coef = m$coefficients[pc],
    var = b[pc, 'Deviance'] / total_deviance * 100,
    p = a[2, 'Pr.Chi.']
  )
  return(d)
}

# Effect of β-diversity on canker count for each PC
beta_canker_results <- data.table(t(sapply(pcs, function(x) beta_canker_anova(x, pc_scores))))
# Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 't': object 'Site' not found
beta_canker_results
#     PC         coef        var            p
# 1: PC1  -0.02475561  0.2245606 0.0001345953
# 2: PC2 -0.007178287 0.02763263    0.3213615
# 3: PC3 -0.003666729  0.3145477    0.4859172
# 4: PC4   0.01570856  0.2983212  0.009036866
# Save environment
save.image("FUN.RData")

Bacteria

# Unpack bacteria data
invisible(mapply(assign, names(ubiome_BAC), ubiome_BAC, MoreArgs = list(envir = globalenv())))

ASV and sample summary

Read and sample summary

cat(
  "Raw reads", "\n\n",
  "Total raw reads:\t\t", sum(countData), "\n",
  "Mean raw reads per sample:\t", mean(colSums(countData)), "\n",
  "Median raw reads per sample:\t", median(colSums(countData)), "\n",
  "Max raw reads per sample:\t", max(colSums(countData)), "\n",
  "Min raw reads per sample:\t", min(colSums(countData)), "\n\n"
)
# Raw reads 
# 
#  Total raw reads:      3365816 
#  Mean raw reads per sample:    41046.54 
#  Median raw reads per sample:  40406.5 
#  Max raw reads per sample:     89023 
#  Min raw reads per sample:     15049
#colSums(countData)

nct <- counts(dds, normalize = T)
cat("Normalised reads", "\n\n",
  "Total normalised reads:\t\t", sum(nct), "\n",
  "Mean normalised reads per sample:\t", mean(colSums(nct)), "\n",
  "Median normalised reads per sample:\t", median(colSums(nct)), "\n",
  "Min normalised reads per sample:\t", min(colSums(nct)), "\n",
  "Max normalised reads per sample:\t", max(colSums(nct)), "\n\n"
)
# Normalised reads 
# 
#  Total normalised reads:       4585124 
#  Mean normalised reads per sample:     55916.14 
#  Median normalised reads per sample:   53407.32 
#  Min normalised reads per sample:  9940.045 
#  Max normalised reads per sample:  139399.1
#round(colSums(counts(dds,normalize = T)),0)

ASV summary

cat(
  "Total ASVs:\t\t", nrow(taxData),"\n\n",
  "Raw reads per ASV summary", "\n\n",
  "Mean raw reads per ASV:\t", mean(rowSums(countData)),"\n",
  "Median raw per ASV:\t\t", median(rowSums(countData)),"\n",
  "ASV raw Min reads:\t\t", min(rowSums(countData)),"\n",
  "ASV raw Max reads:\t\t", max(rowSums(countData)),"\n\n"
)
# Total ASVs:        5883 
# 
#  Raw reads per ASV summary 
# 
#  Mean raw reads per ASV:   572.1258 
#  Median raw per ASV:       120 
#  ASV raw Min reads:        34 
#  ASV raw Max reads:        106398
cat(
  "Normalised reads per ASV summary","\n\n",
  "Mean normalised reads per ASV:\t\t", mean(rowSums(nct)),"\n",
  "Median normalised reads per ASV:\t", median(rowSums(nct)),"\n",
  "ASV normalised Min reads:\t\t", min(rowSums(nct)),"\n",
  "ASV normalised Max reads:\t\t", max(rowSums(nct)),"\n\n"
)
# Normalised reads per ASV summary 
# 
#  Mean normalised reads per ASV:        779.3853 
#  Median normalised reads per ASV:  156.6744 
#  ASV normalised Min reads:         20.59746 
#  ASV normalised Max reads:         139400.3
y <- rowSums(nct)
y <- y[order(y, decreasing = T)]
# proportion
xy <- y/sum(y)

cat("Top ", TOPOTU, "ASVs:\n")
# Top  10 ASVs:
data.frame(counts = y[1:TOPOTU], proportion = xy[1:TOPOTU], rank = taxData[names(y)[1:TOPOTU],]$rank)
#          counts  proportion               rank
# ASV1  139400.30 0.030402734    Streptomyces(g)
# ASV2  124366.87 0.027123994  Kineosporiales(o)
# ASV3  113814.51 0.024822559 Kineosporiaceae(f)
# ASV4   99723.88 0.021749441    Streptomyces(g)
# ASV5   96979.25 0.021150846    Streptomyces(g)
# ASV6   79618.94 0.017364621    Streptomyces(g)
# ASV7   51308.93 0.011190305  Bradyrhizobium(g)
# ASV8   49535.82 0.010803594    Actinoplanes(g)
# ASV20  33817.55 0.007375494  Actinobacteria(c)
# ASV9   33580.42 0.007323775      Nonomuraea(g)

Taxonomy Summary

Taxonomy identifiable

Proportion of ASVs which can be assigned (with the given confidence) at each taxonomic rank

# Proportion of ASVs which can be assigned (with the given confidence) at each taxonomic rank

tx <- copy(taxData)
setDT(tx)
cols <- names(tx)[9:15]

tx[, (cols) := lapply(.SD, as.factor), .SDcols = cols]

data.table(
  rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
  "0.8" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.8) / nrow(tx))), 2),
  "0.65" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.65) / nrow(tx))), 2),
  "0.5" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.5) / nrow(tx))), 2)
)
#       rank  0.8 0.65  0.5
# 1: kingdom 1.00 1.00 1.00
# 2:  phylum 0.94 0.97 0.99
# 3:   class 0.84 0.90 0.93
# 4:   order 0.65 0.72 0.79
# 5:  family 0.51 0.57 0.64
# 6:   genus 0.35 0.43 0.51
# 7: species 0.00 0.00 0.00

% of reads which can be assigned to each taxonomic ranks

tx <-taxData[rownames(dds),]
nc <- counts(dds, normalize = T)
ac <- sum(nc)

data.table(
  rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
  "0.8" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.8),]) / ac * 100))), 2),
  "0.65" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.65),]) / ac * 100))), 2),
  "0.5" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.5),]) / ac * 100))), 2)
)
#       rank   0.8   0.65    0.5
# 1: kingdom 99.95 100.00 100.00
# 2:  phylum 97.78  99.43  99.75
# 3:   class 92.63  95.82  98.05
# 4:   order 71.84  81.25  87.03
# 5:  family 58.63  67.80  73.01
# 6:   genus 44.35  51.20  58.33
# 7: species  0.00   0.00   0.00

Taxonomy plots

Plots of proportion of normalised reads assigned to members of phylum and class.

dat <- list(as.data.frame(counts(dds, normalize = T)), taxData, as.data.frame(colData(dds)))

design <- c("Site", "Storage")

# md1 <- getSummedTaxa(dat, conf = TAXCONF, design = design, cutoff = 0.1)
md1 <- getSummedTaxa(dat, conf = TAXCONF, design = design, taxon = "phylum", cutoff = 0.1)
# Error in `[.data.frame`(obj[[3]], , design, drop = FALSE): undefined columns selected
md1[, Site := factor(Site, levels = c(1, 2, 3))]
md1[, Storage := factor(Storage, levels = c("no", "yes"))]
md1[, taxon := factor(taxon, levels = unique(taxon[order(value, decreasing = T)]))]

removals <- md1[, .(value = mean(value)), by = "taxon"][value < 0.5, taxon]
md1 <- md1[!taxon %in% removals, ]

bac_phylum_plot <- plotfun1(md1, x = "taxon", fill = "Site") +
  facet_wrap(~ Storage)

ggsave("figures/bac_phylum.png", bac_phylum_plot, width = 25, height = 15, units = "cm")

bac_phylum_plot

md2 <- getSummedTaxa(dat, conf = TAXCONF, design = design, taxon = "class", cutoff = 0.1, topn = 9)
# Error in `[.data.frame`(obj[[3]], , design, drop = FALSE): undefined columns selected
md2[, Site := factor(Site, levels = c(1, 2, 3))]
md2[, Storage := factor(Storage, levels = c("no", "yes"))]
md2[, taxon := factor(taxon, levels = unique(taxon[order(value, decreasing = T)]))]

removals <- md2[, .(value = mean(value)), by = "taxon"][value < 0.5, taxon]
md2 <- md2[!taxon %in% removals, ]

bac_class_plot <- plotfun1(md2, x = "taxon", fill = "Site") +
  facet_wrap(~ Storage)

ggsave("figures/bac_class.png", bac_class_plot, width = 25, height = 15, units = "cm")

bac_class_plot

Abundance

abundance_plot <- ggplot(
  data = as.data.frame(colData(dds)), 
  aes(x = Site, y = log_copy_number, colour = Scion, shape = Storage)
) + geom_jitter() + 
  scale_colour_manual(values = cbPalette)

abundance_plot <- ggboxplot(
  data = as.data.frame(colData(dds)), x = "Site", y = "log_copy_number", 
  color = "Scion", add = "jitter", legend = "top", 
  title = "Bacterial abundance", xlab = "Site", ylab = "log10 copy number"
)
# Error in `purrr::pmap()`:
# ℹ In index: 1.
# ℹ With name: log_copy_number.
# Caused by error in `[[<-.data.frame`:
# ! replacement has 0 rows, data has 82
ggsave(
  filename = "bac_abundance.png", plot = abundance_plot, path = "figures/", 
  height = 20, width = 20, units = "cm"
)
# Error in `geom_jitter()`:
# ! Problem while computing aesthetics.
# ℹ Error occurred in the 1st layer.
# Caused by error in `FUN()`:
# ! object 'Site' not found
abundance_plot
# Error in `geom_jitter()`:
# ! Problem while computing aesthetics.
# ℹ Error occurred in the 1st layer.
# Caused by error in `FUN()`:
# ! object 'Site' not found
# Formula for ANOVA
formula <- update(FULL_DESIGN, log_copy_number ~ .)

abundance_anova <- aov(formula, data = as.data.frame(colData(dds)))
# Error in eval(predvars, data, env): object 'Site' not found
# Normality check
par(mfrow = c(2, 2))
plot(abundance_anova)

png("figures/bac_abundance_norm.png", width = 800, height = 600)
par(mfrow = c(2, 2))
plot(abundance_anova)
dev.off()
# png 
#   2
# Results
summary(abundance_anova)
#                    Df Sum Sq Mean Sq F value   Pr(>F)    
# Site                2 1.9657  0.9828  25.302 7.91e-08 ***
# Storage             1 0.0798  0.0798   2.056   0.1594    
# Scion               6 0.5131  0.0855   2.202   0.0628 .  
# Site:Storage        2 0.0768  0.0384   0.989   0.3809    
# Site:Scion         12 0.2677  0.0223   0.574   0.8494    
# Storage:Scion       6 0.1640  0.0273   0.704   0.6484    
# Site:Storage:Scion 12 0.1889  0.0157   0.405   0.9530    
# Residuals          40 1.5538  0.0388                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
abundance_results <- abundance_anova %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(abundance_results$Sum.Sq)
abundance_results$Perc.Var <- abundance_results$Sum.Sq / total_variance * 100

abundance_results
#                    Df     Sum.Sq    Mean.Sq    F.value       Pr..F.  Perc.Var
# Site                2 1.96566003 0.98283002 25.3015018 7.913123e-08 40.867065
# Storage             1 0.07984549 0.07984549  2.0555037 1.594283e-01  1.660028
# Scion               6 0.51312816 0.08552136  2.2016206 6.280900e-02 10.668194
# Site:Storage        2 0.07683263 0.03841631  0.9889711 3.808702e-01  1.597389
# Site:Scion         12 0.26772071 0.02231006  0.5743394 8.494197e-01  5.566049
# Storage:Scion       6 0.16398521 0.02733087  0.7035927 6.484006e-01  3.409335
# Site:Storage:Scion 12 0.18892649 0.01574387  0.4053027 9.530223e-01  3.927877
# Residuals          40 1.55378922 0.03884473         NA           NA 32.304063

Alpha diversity analysis

Alpha diversity plot

# plot alpha diversity - plot_alpha will convert normalised abundances to integer values

bac_alpha_plot <- plot_alpha(
  counts(dds,normalize = F), colData(dds),
  design = "Scion", colour = "Site",
  measures = c("Shannon", "Simpson"),
  type="box"
) + 
  scale_colour_manual(values = cbPalette) + 
  theme(axis.title.x =  element_blank()) + 
  ggtitle("Bacterial α-diversity")

abundance_plot <- ggboxplot(
  data = as.data.frame(colData(dds)), x = "Site", y = "log_copy_number", 
  color = "Scion", add = "jitter", legend = "top", 
  title = "Bacterial abundance", xlab = "Site", ylab = "log10 copy number"
)
# Error in `purrr::pmap()`:
# ℹ In index: 1.
# ℹ With name: log_copy_number.
# Caused by error in `[[<-.data.frame`:
# ! replacement has 0 rows, data has 82
ggsave(
  filename = "bac_alpha.png", plot = bac_alpha_plot, path = "figures/", 
  height = 20, width = 40, units = "cm"
)
# Error in `geom_boxplot()`:
# ! Problem while computing aesthetics.
# ℹ Error occurred in the 1st layer.
# Caused by error in `FUN()`:
# ! object 'Scion' not found
bac_alpha_plot
# Error in `geom_boxplot()`:
# ! Problem while computing aesthetics.
# ℹ Error occurred in the 1st layer.
# Caused by error in `FUN()`:
# ! object 'Scion' not found

Permutation based anova on diversity index ranks

# get the diversity index data
all_alpha_ord <- plot_alpha(
  counts(dds, normalize = F), colData(dds), design = "Site", returnData = T
)

# join diversity indices and metadata
all_alpha_ord <- all_alpha_ord[
  as.data.table(colData(dds), keep.rownames = "Samples"), on = "Samples"
]

bac_alpha <- all_alpha_ord

formula <- FULL_DESIGN

Chao1

setkey(all_alpha_ord, S.chao1)
all_alpha_ord[, measure := as.numeric(as.factor(S.chao1))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# Error in eval(predvars, data, env): object 'Site' not found
summary(result)
#        Df        SumOfSqs             R2                F          
#  Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
#  1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
#  Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
#  Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
#  3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
#  Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
#                                                   NA's   :2        
#      Pr(>F)        
#  Min.   :0.000999  
#  1st Qu.:0.029970  
#  Median :0.062937  
#  Mean   :0.282575  
#  3rd Qu.:0.481519  
#  Max.   :0.891109  
#  NA's   :2
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
# Error in `$<-.data.frame`(`*tmp*`, Perc.Var, value = numeric(0)): replacement has 0 rows, data has 7
df
#        X......Df      X...SumOfSqs         X......R2          X......F
# X   Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
# X.1 1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
# X.2 Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
# X.3 Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
# X.4 3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
# X.5 Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
# X.6         <NA>              <NA>              <NA>       NA's   :2  
#            X....Pr..F.
# X   Min.   :0.000999  
# X.1 1st Qu.:0.029970  
# X.2 Median :0.062937  
# X.3 Mean   :0.282575  
# X.4 3rd Qu.:0.481519  
# X.5 Max.   :0.891109  
# X.6        NA's   :2

Shannon

setkey(all_alpha_ord, shannon)
all_alpha_ord[, measure := as.numeric(as.factor(shannon))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# Error in eval(predvars, data, env): object 'Site' not found
summary(result)
#        Df        SumOfSqs             R2                F          
#  Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
#  1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
#  Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
#  Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
#  3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
#  Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
#                                                   NA's   :2        
#      Pr(>F)        
#  Min.   :0.000999  
#  1st Qu.:0.029970  
#  Median :0.062937  
#  Mean   :0.282575  
#  3rd Qu.:0.481519  
#  Max.   :0.891109  
#  NA's   :2
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
# Error in `$<-.data.frame`(`*tmp*`, Perc.Var, value = numeric(0)): replacement has 0 rows, data has 7
df
#        X......Df      X...SumOfSqs         X......R2          X......F
# X   Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
# X.1 1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
# X.2 Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
# X.3 Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
# X.4 3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
# X.5 Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
# X.6         <NA>              <NA>              <NA>       NA's   :2  
#            X....Pr..F.
# X   Min.   :0.000999  
# X.1 1st Qu.:0.029970  
# X.2 Median :0.062937  
# X.3 Mean   :0.282575  
# X.4 3rd Qu.:0.481519  
# X.5 Max.   :0.891109  
# X.6        NA's   :2

Simpson

setkey(all_alpha_ord, simpson)
all_alpha_ord[, measure := as.numeric(as.factor(simpson))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# Error in eval(predvars, data, env): object 'Site' not found
summary(result)
#        Df        SumOfSqs             R2                F          
#  Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
#  1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
#  Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
#  Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
#  3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
#  Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
#                                                   NA's   :2        
#      Pr(>F)        
#  Min.   :0.000999  
#  1st Qu.:0.029970  
#  Median :0.062937  
#  Mean   :0.282575  
#  3rd Qu.:0.481519  
#  Max.   :0.891109  
#  NA's   :2
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
# Error in `$<-.data.frame`(`*tmp*`, Perc.Var, value = numeric(0)): replacement has 0 rows, data has 7
df
#        X......Df      X...SumOfSqs         X......R2          X......F
# X   Min.   : 1   Min.   : 0.2240   Min.   :0.01514   Min.   : 0.8302  
# X.1 1st Qu.: 2   1st Qu.: 0.7189   1st Qu.:0.04859   1st Qu.: 0.9901  
# X.2 Median : 6   Median : 1.2000   Median :0.08112   Median : 1.3577  
# X.3 Mean   :18   Mean   : 3.2874   Mean   :0.22222   Mean   : 4.0601  
# X.4 3rd Qu.:12   3rd Qu.: 4.6757   3rd Qu.:0.31606   3rd Qu.: 2.4219  
# X.5 Max.   :81   Max.   :14.7934   Max.   :1.00000   Max.   :19.4091  
# X.6         <NA>              <NA>              <NA>       NA's   :2  
#            X....Pr..F.
# X   Min.   :0.000999  
# X.1 1st Qu.:0.029970  
# X.2 Median :0.062937  
# X.3 Mean   :0.282575  
# X.4 3rd Qu.:0.481519  
# X.5 Max.   :0.891109  
# X.6        NA's   :2

Beta diversity PCA/NMDS

PCA

# Number of PCs to include
n_pcs <- 10

# perform PC decomposition of DES object
mypca <- des_to_pca(dds)

# to get pca plot axis into the same scale create a dataframe of PC scores multiplied by their variance
bac_pca <- t(data.frame(t(mypca$x) * mypca$percentVar))

formula = FULL_DESIGN

Percent variation in first 10 PCs

# Cumulative percentage of variance explained
pca_cum_var <- data.frame(
  cumulative = cumsum(mypca$percentVar * 100),
  no = 1:length(mypca$percentVar)
)

# Plot cumulative percentage of variance explained
bac_cum_pca <- ggline(
  pca_cum_var, x = "no", y = "cumulative", plot_type = "l",
  xlab = "Number of PCs", ylab = "Cumulative % variance explained",
  title = "Bacteria: cumulative % variance explained by PCs"
)
ggsave(filename = "bac_cum_pca.png", plot = bac_cum_pca, path = "figures/",)
bac_cum_pca

pca_var <- data.frame(
  PC = paste0("PC", 1:n_pcs),
  perc_var = round(mypca$percentVar[1:n_pcs] * 100, 1)
)

pca_var
#      PC perc_var
# 1   PC1     18.6
# 2   PC2     12.7
# 3   PC3      7.1
# 4   PC4      4.0
# 5   PC5      3.0
# 6   PC6      2.1
# 7   PC7      1.8
# 8   PC8      1.6
# 9   PC9      1.3
# 10 PC10      1.3

ANOVA of first 10 PCs

pca_summary <- apply(
  mypca$x[, 1:n_pcs], 2, 
  function(x){
    summary(aov(update(formula, x ~ .), data = as.data.frame(cbind(x, colData(dds)))))
  }
)
# Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'Site' not found
pca_summary
# $PC1
#                    Df Sum Sq Mean Sq F value Pr(>F)    
# Site                2 190646   95323 224.072 <2e-16 ***
# Storage             1     25      25   0.058 0.8106    
# Scion               6   4227     704   1.656 0.1572    
# Site:Storage        2   2566    1283   3.015 0.0603 .  
# Site:Scion         12   8401     700   1.646 0.1176    
# Storage:Scion       6   1527     255   0.598 0.7298    
# Site:Storage:Scion 12   5124     427   1.004 0.4634    
# Residuals          40  17016     425                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC2
#                    Df Sum Sq Mean Sq F value Pr(>F)    
# Site                2 137668   68834 276.644 <2e-16 ***
# Storage             1    142     142   0.572  0.454    
# Scion               6   1474     246   0.987  0.447    
# Site:Storage        2    604     302   1.213  0.308    
# Site:Scion         12   4425     369   1.482  0.172    
# Storage:Scion       6    711     119   0.476  0.822    
# Site:Storage:Scion 12   2035     170   0.681  0.759    
# Residuals          40   9953     249                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC3
#                    Df Sum Sq Mean Sq F value   Pr(>F)    
# Site                2   2673    1337   1.562 0.222149    
# Storage             1     30      30   0.035 0.851920    
# Scion               6  13665    2278   2.663 0.028714 *  
# Site:Storage        2  14748    7374   8.621 0.000771 ***
# Site:Scion         12  10740     895   1.046 0.428331    
# Storage:Scion       6   8140    1357   1.586 0.176467    
# Site:Storage:Scion 12   2870     239   0.280 0.989466    
# Residuals          40  34217     855                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC4
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2   5281  2640.3   5.834 0.00598 **
# Storage             1    635   634.6   1.402 0.24336   
# Scion               6   3161   526.9   1.164 0.34456   
# Site:Storage        2    630   315.1   0.696 0.50442   
# Site:Scion         12  13692  1141.0   2.521 0.01418 * 
# Storage:Scion       6   1827   304.5   0.673 0.67221   
# Site:Storage:Scion 12   6439   536.6   1.186 0.32589   
# Residuals          40  18104   452.6                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC5
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2      5       2   0.006 0.99447   
# Storage             1   4033    4033   9.433 0.00382 **
# Scion               6   1458     243   0.568 0.75299   
# Site:Storage        2   5967    2984   6.979 0.00251 **
# Site:Scion         12   2582     215   0.503 0.90010   
# Storage:Scion       6   1984     331   0.773 0.59554   
# Site:Storage:Scion 12   3480     290   0.678 0.76149   
# Residuals          40  17102     428                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC6
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2     32    16.2   0.055 0.9467  
# Storage             1   1498  1497.8   5.070 0.0299 *
# Scion               6   4754   792.3   2.682 0.0278 *
# Site:Storage        2    811   405.3   1.372 0.2653  
# Site:Scion         12   2040   170.0   0.575 0.8486  
# Storage:Scion       6   1724   287.4   0.973 0.4560  
# Site:Storage:Scion 12   3166   263.8   0.893 0.5611  
# Residuals          40  11817   295.4                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC7
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2     85    42.7   0.157 0.8556  
# Storage             1    448   448.2   1.645 0.2070  
# Scion               6   2285   380.9   1.398 0.2393  
# Site:Storage        2   2000   999.9   3.670 0.0344 *
# Site:Scion         12   2462   205.1   0.753 0.6925  
# Storage:Scion       6    983   163.9   0.602 0.7273  
# Site:Storage:Scion 12   3081   256.7   0.942 0.5166  
# Residuals          40  10898   272.4                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC8
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2    110    55.1   0.197  0.822
# Storage             1    549   548.5   1.963  0.169
# Scion               6    750   125.1   0.448  0.842
# Site:Storage        2    334   167.1   0.598  0.555
# Site:Scion         12   1153    96.1   0.344  0.975
# Storage:Scion       6   1856   309.4   1.108  0.375
# Site:Storage:Scion 12   3233   269.4   0.964  0.497
# Residuals          40  11175   279.4               
# 
# $PC9
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2    291   145.4   0.624 0.54101   
# Storage             1    120   120.0   0.515 0.47731   
# Scion               6    658   109.6   0.470 0.82619   
# Site:Storage        2   3474  1737.2   7.452 0.00177 **
# Site:Scion         12   1215   101.2   0.434 0.93963   
# Storage:Scion       6    606   101.0   0.433 0.85225   
# Site:Storage:Scion 12    970    80.9   0.347 0.97417   
# Residuals          40   9324   233.1                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC10
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2    131    65.4   0.396 0.67572   
# Storage             1    365   365.5   2.213 0.14466   
# Scion               6   1060   176.7   1.070 0.39635   
# Site:Storage        2   2202  1100.8   6.666 0.00317 **
# Site:Scion         12   2043   170.2   1.031 0.44086   
# Storage:Scion       6   1947   324.4   1.965 0.09381 . 
# Site:Storage:Scion 12   1980   165.0   0.999 0.46711   
# Residuals          40   6605   165.1                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Percent variation in first 10 PCs for each factor

# Extract PC scores as a list of dataframes
pcas <- lapply(pca_summary, function(i) data.frame(unclass(i)))

# Merge into single dataframe
pcs_factors_tidy <- lapply(
  names(pcas),
  function(name) {
    pcas[[name]] %>%
    mutate(
      PC = name, #substring(name, 3),
      Factor = gsub(" ", "", rownames(pcas[[name]])),
      var = Sum.Sq / sum(pcas[[name]]$Sum.Sq) * 100,
      pc_var = subset(pca_var, PC == name)$"perc_var",
      total_var = var * pc_var / 100,
      sig = case_when(
        is.na(Pr..F.) ~ "",
        Pr..F. < 0.001 ~ "***",
        Pr..F. < 0.01 ~ "**",
        Pr..F. < 0.05 ~ "*",
        TRUE ~ ""
      ),
      variance = ifelse(
        total_var < 0.01, paste0("<0.01", sig),
        paste0(round(total_var, 2), sig)
      )
    )
  }
) %>% bind_rows() %>% data.table()

# Order PCs and factors
pcs_factors_tidy$PC <- factor(pcs_factors_tidy$PC, levels = paste0("PC", 1:n_pcs))
pcs_factors_tidy$Factor <- factor(pcs_factors_tidy$Factor, levels = unique(pcs_factors_tidy$Factor))

# Significant factors
pcs_factors_tidy[
  Pr..F. < 0.05, 
  c("PC", "Factor", "Df", "F.value", "Pr..F.", "var", "pc_var", "total_var")
]
#       PC       Factor Df    F.value       Pr..F.       var pc_var  total_var
#  1:  PC1         Site  2 224.072105 1.863103e-22 83.058689   18.6 15.4489162
#  2:  PC2         Site  2 276.643717 3.766143e-24 87.680340   12.7 11.1354032
#  3:  PC3        Scion  6   2.662540 2.871400e-02 15.692236    7.1  1.1141487
#  4:  PC3 Site:Storage  2   8.620505 7.709993e-04 16.935585    7.1  1.2024266
#  5:  PC4         Site  2   5.833732 5.982146e-03 10.610344    4.0  0.4244137
#  6:  PC4   Site:Scion 12   2.521003 1.417543e-02 27.511079    4.0  1.1004432
#  7:  PC5      Storage  1   9.432657 3.823182e-03 11.015539    3.0  0.3304662
#  8:  PC5 Site:Storage  2   6.978586 2.513137e-03 16.299308    3.0  0.4889792
#  9:  PC6      Storage  1   5.069838 2.990656e-02  5.795920    2.1  0.1217143
# 10:  PC6        Scion  6   2.681842 2.779054e-02 18.395545    2.1  0.3863064
# 11:  PC7 Site:Storage  2   3.670184 3.440069e-02  8.991247    1.8  0.1618424
# 12:  PC9 Site:Storage  2   7.452304 1.774287e-03 20.856557    1.3  0.2711352
# 13: PC10 Site:Storage  2   6.666103 3.172553e-03 13.479336    1.3  0.1752314
# Table with factors as columns and PCs as rows
# pcs_factors <- dcast(pcs_factors_tidy, PC ~ Factor, value.var = "variance")
pcs_factors <- pcs_factors_tidy %>%
  select(PC, pc_var, Factor, variance) %>%
  spread(key = Factor, value = variance)

pcs_factors
#      PC pc_var     Site Storage Scion Site:Storage Site:Scion Storage:Scion
# 1   PC1   18.6 15.45***   <0.01  0.34         0.21       0.68          0.12
# 2   PC2   12.7 11.14***    0.01  0.12         0.05       0.36          0.06
# 3   PC3    7.1     0.22   <0.01 1.11*       1.2***       0.88          0.66
# 4   PC4    4.0   0.42**    0.05  0.25         0.05       1.1*          0.15
# 5   PC5    3.0    <0.01  0.33**  0.12       0.49**       0.21          0.16
# 6   PC6    2.1    <0.01   0.12* 0.39*         0.07       0.17          0.14
# 7   PC7    1.8    <0.01    0.04  0.18        0.16*        0.2          0.08
# 8   PC8    1.6    <0.01    0.05  0.06         0.03        0.1          0.16
# 9   PC9    1.3     0.02   <0.01  0.05       0.27**       0.09          0.05
# 10 PC10    1.3     0.01    0.03  0.08       0.18**       0.16          0.15
#    Site:Storage:Scion Residuals
# 1                0.42      1.38
# 2                0.16      0.81
# 3                0.23      2.79
# 4                0.52      1.46
# 5                0.29       1.4
# 6                0.26      0.96
# 7                0.25      0.88
# 8                0.27      0.93
# 9                0.08      0.73
# 10               0.16      0.53

PCA plot

bac_pca_plot <- plotOrd(
  bac_pca,
  colData(dds),
  design = "Site",
  shape = "Storage",
  axes = c(1, 2),
  # facet = "Storage", 
  cbPalette = T,
  alpha = 0.75,
) #+ facet_wrap(~facet)
# WARNING: Incorrect columns specified "Site"WARNING: Incorrect columns specified "Storage"
ggsave(filename = "bac_pca_plot.png", plot = bac_pca_plot, path = "figures/")

bac_pca_plot

bac_pca_3_6_plot <- plotOrd(
  bac_pca,
  colData(dds),
  design = "Scion",
  shape = "Storage",
  axes = c(3, 6), 
  cbPalette = T,
  alpha = 0.75,
)
# WARNING: Incorrect columns specified "Scion"WARNING: Incorrect columns specified "Storage"
ggsave(filename = "bac_pca_3_6_plot.png", plot = bac_pca_3_6_plot, path = "figures/")

bac_pca_3_6_plot

PCA sum of squares (% var)

sum_squares <- apply(mypca$x, 2 ,function(x) 
  summary(aov(update(formula, x ~ .), data = cbind(x, colData(dds))))[[1]][2]
)
# Error in h(simpleError(msg, call)): error in evaluating the argument 'object' in selecting a method for function 'summary': object 'Site' not found
sum_squares <- do.call(cbind, sum_squares)
# Error in do.call(cbind, sum_squares): second argument must be a list
x <- t(apply(sum_squares, 2, prop.table))
perVar <- x * mypca$percentVar
#colSums(perVar)
round(colSums(perVar) / sum(colSums(perVar)) * 100, 3)
# [1] 27.377  1.163  6.201  3.608 11.077  5.262  9.804 35.509

ADONIS

# Calculate Bray-Curtis distance matrix
vg <- vegdist(t(counts(dds, normalize = T)), method = "bray")

formula <- update(FULL_DESIGN, vg ~ .)

set.seed(sum(utf8ToInt("Hamish McLean")))
result <- adonis2(formula, colData(dds), permutations = 1000)
# Error in eval(predvars, data, env): object 'Site' not found
result
# Permutation test for adonis under reduced model
# Terms added sequentially (first to last)
# Permutation: free
# Number of permutations: 1000
# 
# adonis2(formula = formula, data = colData(dds), permutations = 1000)
#                    Df SumOfSqs      R2       F   Pr(>F)    
# Site                2   4.6757 0.31606 19.4091 0.000999 ***
# Storage             1   0.2240 0.01514  1.8596 0.056943 .  
# Scion               6   0.9812 0.06633  1.3577 0.062937 .  
# Site:Storage        2   0.7189 0.04859  2.9841 0.002997 ** 
# Site:Scion         12   1.4891 0.10066  1.0302 0.406593    
# Storage:Scion       6   0.6866 0.04641  0.9500 0.556444    
# Site:Storage:Scion 12   1.2000 0.08112  0.8302 0.891109    
# Residual           40   4.8180 0.32569                     
# Total              81  14.7934 1.00000                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- result %>% data.frame()
df$Perc.Var <- df$SumOfSqs / df["Total", "SumOfSqs"] * 100
df
#                    Df   SumOfSqs         R2          F      Pr..F.   Perc.Var
# Site                2  4.6756525 0.31606401 19.4090620 0.000999001  31.606401
# Storage             1  0.2239849 0.01514090  1.8595639 0.056943057   1.514090
# Scion               6  0.9812079 0.06632754  1.3576946 0.062937063   6.632754
# Site:Storage        2  0.7188806 0.04859478  2.9841392 0.002997003   4.859478
# Site:Scion         12  1.4890925 0.10065944  1.0302266 0.406593407  10.065944
# Storage:Scion       6  0.6865717 0.04641076  0.9500073 0.556443556   4.641076
# Site:Storage:Scion 12  1.1999721 0.08111552  0.8301990 0.891108891   8.111552
# Residual           40  4.8180097 0.32568705         NA          NA  32.568705
# Total              81 14.7933719 1.00000000         NA          NA 100.000000

NMDS ordination

set.seed(sum(utf8ToInt("Hamish McLean")))
ord <- metaMDS(vg,trace=0) 
#sratmax=20000,maxit=20000,try = 177, trymax = 177

bac_nmds <- scores(ord)

bac_nmds_plot <- plotOrd(
  bac_nmds, colData(dds), 
  design = "Site", 
  shape = "Storage", 
  alpha = 0.75, cbPalette = T
) #+ theme(text = element_text(size = 14))
# WARNING: Incorrect columns specified "Site"WARNING: Incorrect columns specified "Storage"
ggsave(filename = "fun_nmds_plot.png", plot = bac_nmds_plot, path = "figures/")

bac_nmds_plot

ASV abundance

Explore distribution of ASV counts

# Extract normalised counts from DESeq object
asv_counts <- counts(dds, normalize = T) %>% as.data.frame()

# Sum ASV counts across samples
total_asv_counts <- rowSums(asv_counts)

# Sort ASVs by abundance
total_asv_counts <- total_asv_counts[order(total_asv_counts, decreasing = T)]

# Caculate cumulative percentage
cumulative <- data.frame(
  cumulative = cumsum(total_asv_counts) / sum(total_asv_counts) * 100,
  no = seq_along(total_asv_counts)
)

# Plot cumulative percentage of ASVs
bac_cum_asv <- ggline(
  data = cumulative, x = "no", y = "cumulative", 
  plot_type = "l", palette = cbPalette,
  title = "Cumulative percentage of bacterial ASVs", xlab = "Number of ASVs", 
  ylab = "Cumulative percentage of reads"
)
ggsave(filename = "bac_cum_asv.png", plot = bac_cum_asv, path = "figures/")
bac_cum_asv

# Find the number of ASVs that account for 50%, 80%, and 99% of total reads
cat(
  "Number of ASVs that account for 50%, 80%, and 99% of total reads", "\n\n",
  "50%:", sum(cumulative <= 50), "\n",
  "80%:", sum(cumulative <= 80), "\n",
  "99%:", sum(cumulative <= 99), "\n"
)
# Number of ASVs that account for 50%, 80%, and 99% of total reads 
# 
#  50%: 205 
#  80%: 1055 
#  99%: 5037
# Find the cumulative percentage accounted for by top x ASVs
cat(
  "Percentage of total reads accounted for by the top 100, 200,and 500 ASVs:", "\n\n",
  "100:", round(cumulative[cumulative$no == 100, "cumulative"], 1) , "\n",
  "200:", round(cumulative[cumulative$no == 200, "cumulative"], 1) , "\n",
  "500:", round(cumulative[cumulative$no == 500, "cumulative"], 1) , "\n"
)
# Percentage of total reads accounted for by the top 100, 200,and 500 ASVs: 
# 
#  100: 43.7 
#  200: 53.9 
#  500: 69.1
# Average ASV counts in order
mean_asv_counts <- rowMeans(asv_counts)
mean_asv_counts <- mean_asv_counts[order(mean_asv_counts, decreasing = T)]

# Plot read count distribution
bac_asv_counts <- ggline(
  data = data.frame(ASV = seq_along(mean_asv_counts), counts = mean_asv_counts),
  x = "ASV", y = "counts", plot_type = "l",
  title = "Bacterial ASV read count distribution", xlab = "ASV", ylab = "Mean read count"
)
ggsave(filename = "bac_asv_counts.png", plot = bac_asv_counts, path = "figures/")
bac_asv_counts

# Number of ASVs with mean read count > 100, 200, and 500
cat(
  "Number of ASVs with mean read count > 100, 200, and 500", "\n\n",
  "100:", sum(rowMeans(asv_counts) > 100), "\n",
  "200:", sum(rowMeans(asv_counts) > 200), "\n",
  "500:", sum(rowMeans(asv_counts) > 500), "\n"
)
# Number of ASVs with mean read count > 100, 200, and 500 
# 
#  100: 71 
#  200: 32 
#  500: 8

Filter top ASVs with mean read count > 100

# Filter the top x abundant ASVs by the sum of their normalised counts
# top_asvs <- asv_counts[order(rowSums(asv_counts), decreasing = T)[1:DIFFOTU], ]

# Filter ASVs with mean read count > 100
top_asvs <- asv_counts[rowMeans(asv_counts) > 100, ]

# Check that sample names match
identical(names(top_asvs), rownames(colData))
# [1] TRUE
# Extract taxonomic data for top ASVs
top_taxa <- taxData[rownames(top_asvs), ]

# Log transform normalised counts
# top_asvs <- log10(top_asvs + 1) # Log transform in models instead

top_asv_data <- data.frame(t(top_asvs))
top_asv_ids <- rownames(top_asvs)
identical(rownames(top_asv_data), rownames(colData))
# [1] TRUE
top_asv_data <- merge(top_asv_data, colData, by = 0) %>% column_to_rownames("Row.names")

Effect of design factors on abundance of top ASVs

Effect of Site, Scion, and Storage on abundance of top 200 ASVs

# Perform ANOVA on list of top ASVs
top_asvs_anova_results <- lapply(
  top_asv_ids, 
  function(asv) {
    asv_lm_anova(asv, formula, top_asv_data) %>%
    extend_asv_anova(asv)
  }
) %>% bind_rows() %>% data.table()
# Error in eval(predvars, data, env): object 'Site' not found
# Group by factor and adjust p-values
top_asvs_anova_results <- top_asvs_anova_results %>% 
  group_by(Factor) %>% 
  mutate(p.adj = p.adjust(`Pr..F.`, method = "BH")) %>% 
  data.table()

# Order factors by original order
top_asvs_anova_results$Factor <- factor(top_asvs_anova_results$Factor, levels = unique(top_asvs_anova_results$Factor))

# Summary of top ASV ANOVA results
top_asvs_anova_summary <- top_asvs_anova_results %>% 
  select(ASV, Taxonomy, Abundance, Factor, Variance) %>% 
  spread(key = Factor, value = Variance)

top_asvs_anova_summary
#        ASV               Taxonomy Abundance     Site Storage    Scion
# 1     ASV1        Streptomyces(g) 1700.0037 28.73***    0.05     3.02
# 2    ASV10    Acidimicrobiales(o)  348.6716  13.45**     0.8  19.01**
# 3  ASV1043        Streptomyces(g)  145.7812 27.68***    0.04     3.89
# 4    ASV11        Actinoplanes(g)  362.5498 57.01***    0.83   8.78**
# 5    ASV12         Pseudomonas(g)  344.1967   10.44*    3.95    13.96
# 6   ASV124   Micromonosporales(o)  120.5557 39.92***   <0.01     5.55
# 7    ASV13 Gammaproteobacteria(c)  401.3884 55.78***    0.65     5.14
# 8    ASV14  Micromonosporaceae(f)  337.7555 56.68***    0.28     5.48
# 9   ASV147      Actinobacteria(c)  114.0728 35.46***    0.16   11.93*
# 10   ASV15      Actinocorallia(g)  345.7558 61.09***    0.04    6.95*
# 11   ASV16      Acidimicrobiia(c)  233.2312 53.48***   <0.01     7.35
# 12   ASV17    Mycobacteriaceae(f)  336.7446 63.95***    0.15  8.08***
# 13   ASV18     Cellvibrionales(o)  239.5032 16.69***    0.25    11.82
# 14   ASV19             Erwinia(g)  228.8789  39.5***    0.96     6.57
# 15    ASV2      Kineosporiales(o) 1516.6692  19.1***    0.13   19.22*
# 16   ASV20      Actinobacteria(c)  412.4092 78.84***     0.5     1.78
# 17 ASV2064      Actinocorallia(g)  107.8827 53.93***     0.1    8.55*
# 18   ASV21        Streptomyces(g)  315.5804 36.15***     0.1     3.35
# 19   ASV22           Rhizobium(g)  292.4708     5.65    0.03     8.89
# 20 ASV2209    Streptomycetales(o)  117.0966 37.67***    0.45        4
# 21   ASV23       Mycobacterium(g)  280.6984 54.35***    0.03     7.1*
# 22   ASV24        Arthrobacter(g)  355.3985 16.15***  4.99**   11.22*
# 23   ASV25         Pseudomonas(g)  287.0610 18.54***   6.26*    12.54
# 24   ASV26        Streptomyces(g)  317.6134 49.23***   <0.01     4.17
# 25   ASV27  Micromonosporaceae(f)  175.7534 43.52***    0.38    5.69*
# 26   ASV28       Amycolatopsis(g)  305.0520 57.08***    0.12 11.85***
# 27   ASV29        Streptomyces(g)  287.4622 39.42***   <0.01     5.45
# 28    ASV3     Kineosporiaceae(f) 1387.9818 31.94***    0.17     9.67
# 29   ASV30        Streptomyces(g)  194.7991 51.85***   2.25*     5.91
# 30   ASV31        Streptomyces(g)  262.3983 42.65***    0.34     3.04
# 31   ASV32        Streptomyces(g)  250.3974 52.64***   <0.01     6.61
# 32   ASV33      Bradyrhizobium(g)  214.6676 60.57***    0.04    5.94*
# 33   ASV34 Gammaproteobacteria(c)  163.6252 17.83***     0.6   16.63*
# 34  ASV340        Streptomyces(g)  119.0172 25.65***    0.47     2.19
# 35 ASV3482        Streptomyces(g)  112.5339 36.02***    0.01     7.18
# 36   ASV35 Solirubrobacterales(o)  153.5592 22.55***    0.25    14.43
# 37   ASV36       Bacillaceae_1(f)  167.0648 20.73***    0.36  14.84**
# 38   ASV37    Sphingomonadales(o)  178.1947    8.53*    0.85    11.19
# 39   ASV38       Mycobacterium(g)  159.5277 37.99***    0.05   11.27*
# 40   ASV39        Streptomyces(g)  145.2649 45.47***    0.21     2.27
# 41    ASV4        Streptomyces(g) 1216.1449 38.85***    0.26     1.97
# 42   ASV40     Mycobacteriales(o)  176.0332 55.81***    0.51    7.82*
# 43   ASV41        Yersiniaceae(f)  146.2637 28.91***    3.77     0.46
# 44   ASV42 Gammaproteobacteria(c)  124.9541 14.98***    0.35   16.58*
# 45   ASV43 Solirubrobacterales(o)  156.1030 64.17***    0.21    7.74*
# 46   ASV44       Aquabacterium(g)  118.4572 45.69***    0.08     3.92
# 47   ASV46      Bradyrhizobium(g)  119.4612 32.98***    0.02     7.82
# 48   ASV47          Variovorax(g)  112.3346   10.73*    2.99     6.64
# 49   ASV48 Gammaproteobacteria(c)  126.8827   10.48*    0.07       13
# 50   ASV49     Novosphingobium(g)  140.2648 31.51***    0.05  13.07**
# 51    ASV5        Streptomyces(g) 1182.6737 43.71***    0.13     3.36
# 52   ASV50          Dokdonella(g)  110.2877  48.4***    0.69     4.14
# 53   ASV51  Rhodanobacteraceae(f)  113.1663 18.95***    0.34    12.02
# 54   ASV54      Comamonadaceae(f)  122.1397 65.96***    0.42     2.36
# 55   ASV55      Comamonadaceae(f)  137.4410 63.52***    0.55     1.88
# 56   ASV56 Gammaproteobacteria(c)  118.5193 33.11***    0.15     8.38
# 57   ASV58 Thermomonosporaceae(f)  118.7442 53.85***    0.05    7.25*
# 58   ASV59 Streptosporangiales(o)  103.7496 42.95***    0.28     6.77
# 59 ASV5965   Streptomycetaceae(f)  134.5568  64.4***    0.02     3.84
# 60    ASV6        Streptomyces(g)  970.9627 49.18***    0.27     6.22
# 61   ASV61           Kribbella(g)  100.7063 34.63***    1.56  11.88**
# 62   ASV63        Povalibacter(g)  197.9080 30.97***     0.5   11.65*
# 63   ASV64      Actinobacteria(c)  392.8443 24.07***    0.83     6.96
# 64   ASV65            Nocardia(g)  100.3498 53.35***   2.61*     6.11
# 65    ASV7      Bradyrhizobium(g)  625.7187 13.23***     0.5  18.28**
# 66   ASV70           Kribbella(g)  102.5546 37.87***    0.21  11.18**
# 67    ASV8        Actinoplanes(g)  604.0953  62.8***    0.82    4.12*
# 68   ASV83      Steroidobacter(g)  127.8547     1.08    0.01 25.68***
# 69   ASV88            Nocardia(g)  106.5701 75.76***    0.18      2.7
# 70    ASV9          Nonomuraea(g)  409.5173     3.44    1.95     7.95
# 71  ASV944        Streptomyces(g)  125.6294 57.47***    0.04     2.33
#    Site:Storage Site:Scion Storage:Scion Site:Storage:Scion Residuals
# 1       10.97**      14.56          5.57               7.43     29.66
# 2          4.57      11.69          9.96               5.64     34.89
# 3       12.12**       9.29          6.83               6.12     34.02
# 4      10.66***       3.05           2.8               2.38     14.49
# 5          4.07       5.23           2.8              10.47     49.07
# 6      17.24***       3.67          4.92               4.48     24.22
# 7        5.63**       8.69          2.04               5.66     16.41
# 8       9.36***       5.76          4.87               1.47      16.1
# 9       12.3***        5.4          2.55               4.03     28.18
# 10        2.47*       6.54          3.03               5.42     14.45
# 11         1.74       9.01           2.3               5.12     20.99
# 12      5.82***       4.63          2.54               4.09     10.74
# 13      11.42**       7.19          4.76              12.79     35.09
# 14         1.24       3.64          3.36               7.11     37.62
# 15        7.63*       6.71          4.31               3.22     39.68
# 16         0.85       4.73          0.79               2.02     10.49
# 17        6.5**       3.58          4.27               2.98      20.1
# 18         1.99      14.21          3.61               8.79      31.8
# 19         6.49      15.93          9.25               6.63     47.13
# 20      10.97**       7.65          4.85               5.27     29.13
# 21         1.26        7.2          4.33                6.1     19.62
# 22     17.29***       4.02          6.86              12.88     26.58
# 23         3.21       7.11          2.63               9.51      40.2
# 24       6.22**       7.24          1.53              11.24     20.37
# 25     10.92***     11.76*          2.14               9.35     16.23
# 26        3.98*       3.31          4.08               3.21     16.36
# 27       9.08**      11.39           6.1               3.14     25.41
# 28         6.4*       9.21          4.18               6.42     32.02
# 29        4.42*     11.94*          2.23               1.62     19.78
# 30        5.72*      11.17           0.7               6.14     30.24
# 31        4.62*       5.26          4.29               2.22     24.36
# 32       5.89**       7.08           1.2                3.6     15.68
# 33         5.65       5.36           6.8                7.3     39.84
# 34     14.48***      16.75          6.19               4.63     29.63
# 35     10.92***        8.3          6.73               5.61     25.22
# 36         0.85       6.78          4.58               8.04     42.53
# 37        5.63*       8.08          8.72               11.6     30.04
# 38        9.2**      17.89          6.52              10.66     35.17
# 39         2.49       7.43         9.07*               7.87     23.84
# 40         5.8*       9.72          3.89               6.17     26.47
# 41     11.57***       7.87          5.36               9.29     24.83
# 42         1.71       5.35          5.08               5.63      18.1
# 43         4.04       2.29          6.99               9.12     44.43
# 44         5.01      14.82          8.53               4.16     35.57
# 45         1.86       3.23          3.37               3.45     15.98
# 46       8.53**        4.5          5.91               4.29     27.08
# 47         4.61      14.35          2.27               3.13     34.83
# 48         1.08      12.65           2.9              11.73     51.28
# 49        8.81*       9.03          4.59              12.01     42.02
# 50        6.61*      11.28          5.24               6.65      25.6
# 51     13.96***      13.2*          2.66               4.53     18.44
# 52       7.82**       8.91          2.76               1.49      25.8
# 53         6.73       5.86          9.77               4.46     41.85
# 54      8.12***       5.59          0.66               3.14     13.75
# 55        3.53*       8.33          0.67               4.32     17.19
# 56        6.79*        6.9          3.51               8.16     32.99
# 57      7.21***       5.08          4.53               7.24      14.8
# 58         2.44      12.23          4.13               5.91      25.3
# 59       5.41**       6.82          1.47                2.9     15.16
# 60       7.34**      12.29          2.35               1.74     20.59
# 61       9.08**       7.53          7.94               3.97     23.42
# 62        6.99*       7.06          7.72               4.18     30.94
# 63      10.88**       8.79           4.6               3.51     40.36
# 64        3.41*       5.71          4.92               4.74     19.16
# 65     14.72***      11.14          8.41               2.68     31.04
# 66     17.38***       7.14          4.75                3.1     18.38
# 67      11.2***       4.15        6.31**               1.46      9.13
# 68       9.95**      13.74          9.19               5.77     34.58
# 69         0.33       5.93          1.01               2.03     12.07
# 70     18.98***      11.16         12.14               6.34     38.04
# 71      9.15***       7.79          1.94               4.06     17.23
cat(
  "Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values", "\n\n",
  "Site:", nrow(top_asvs_anova_results[Factor == "Site" & p.adj < 0.05, ]), "\n",
  "Storage:", nrow(top_asvs_anova_results[Factor == "Storage" & p.adj < 0.05, ]), "\n",
  "Scion:", nrow(top_asvs_anova_results[Factor == "Scion" & p.adj < 0.05, ]), "\n",
  "Site:Storage:", nrow(top_asvs_anova_results[Factor == "Site:Storage" & p.adj < 0.05, ]), "\n",
  "Site:Scion:", nrow(top_asvs_anova_results[Factor == "Site:Scion" & p.adj < 0.05, ]), "\n",
  "Storage:Scion:", nrow(top_asvs_anova_results[Factor == "Storage:Scion" & p.adj < 0.05, ]), "\n",
  "Site:Storage:Scion:", nrow(top_asvs_anova_results[Factor == "Site:Storage:Scion" & p.adj < 0.05, ]), "\n\n",
  "Total ASVs:", length(unique(top_asvs_anova_results$ASV)), "\n\n"
)
# Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values 
# 
#  Site: 65 
#  Storage: 1 
#  Scion: 14 
#  Site:Storage: 36 
#  Site:Scion: 0 
#  Storage:Scion: 1 
#  Site:Storage:Scion: 0 
# 
#  Total ASVs: 71
# Filter by significant effect of scion and its interactions
scion_asvs <- top_asvs_anova_results[grepl("Scion", Factor) & p.adj < 0.05, ]

scion_asvs
#     Df    Sum.Sq  Mean.Sq  F.value       Pr..F.   ASV               Taxonomy
#  1:  6  6.462817 1.077136 3.632478 0.0057001310 ASV10    Acidimicrobiales(o)
#  2:  6 10.164244 1.694041 4.038754 0.0029567506 ASV11        Actinoplanes(g)
#  3:  6 13.711397 2.285233 3.205277 0.0115314268 ASV15      Actinocorallia(g)
#  4:  6  9.239304 1.539884 5.014190 0.0006493282 ASV17    Mycobacteriaceae(f)
#  5:  6 15.526618 2.587770 3.228422 0.0110957163  ASV2      Kineosporiales(o)
#  6:  6 16.024984 2.670831 4.828893 0.0008602464 ASV28       Amycolatopsis(g)
#  7:  6 25.717650 4.286275 3.293104 0.0099653617 ASV36       Bacillaceae_1(f)
#  8:  6  6.662841 1.110474 3.228302 0.0110979402 ASV43 Solirubrobacterales(o)
#  9:  6  6.477019 1.079503 3.404861 0.0082828400 ASV49     Novosphingobium(g)
# 10:  6 11.385314 1.897552 3.266670 0.0104123121 ASV58 Thermomonosporaceae(f)
# 11:  6 10.646995 1.774499 3.380295 0.0086257899 ASV61           Kribbella(g)
# 12:  6  6.128484 1.021414 3.925699 0.0035442496  ASV7      Bradyrhizobium(g)
# 13:  6  8.955213 1.492536 4.056045 0.0028762031 ASV70           Kribbella(g)
# 14:  6 14.890921 2.481820 4.610217 0.0012037794  ASV8        Actinoplanes(g)
# 15:  6 14.755142 2.459190 4.950265 0.0007152424 ASV83      Steroidobacter(g)
#     Abundance        Factor  perc_var sig Variance       p.adj
#  1:  348.6716         Scion 19.008777  **  19.01** 0.027240049
#  2:  362.5498         Scion  8.779256  **   8.78** 0.015149537
#  3:  345.7558         Scion  6.948747   *    6.95* 0.049406199
#  4:  336.7446         Scion  8.075128 ***  8.08*** 0.004137386
#  5: 1516.6692         Scion 19.217052   *   19.22* 0.047962403
#  6:  305.0520         Scion 11.853513 *** 11.85*** 0.005151114
#  7:  167.0648         Scion 14.839456  **  14.84** 0.044619683
#  8:  156.1030         Scion  7.736463   *    7.74* 0.047962403
#  9:  140.2648         Scion 13.074252  **  13.07** 0.038835580
# 10:  118.7442         Scion  7.250910   *    7.25* 0.045795744
# 11:  100.7063         Scion 11.875110  **  11.88** 0.039694607
# 12:  625.7187         Scion 18.278756  **  18.28** 0.017964146
# 13:  102.5546         Scion 11.179662  **  11.18** 0.014890343
# 14:  604.0953 Storage:Scion  6.314704  **   6.31** 0.007038569
# 15:  127.8547         Scion 25.675544 *** 25.68*** 0.004443443
cat(
  length(scion_asvs$ASV), 
  "ASVs with significant (*P* < 0.05) adjusted p-values for the effect of Scion and its interactions.", "\n\n"
)
# 15 ASVs with significant (*P* < 0.05) adjusted p-values for the effect of Scion and its interactions.
# Summary of ASVs with significant Scion effect
top_asvs_anova_summary[ASV %in% scion_asvs$ASV, ]
# Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function '%in%': object 'ASV' not found
# Export significant ASVs as fasta

# Read BAC ASVs
BAC_asvs <- read.fasta("data/BAC.zotus.fa")
# Replace 'OTU' with 'ASV' in sequence names
names(BAC_asvs) <- gsub("OTU", "ASV", names(BAC_asvs))

# Write significant ASVs to fasta
write.fasta(
  sequences = BAC_asvs[scion_asvs$ASV],
  names = paste(scion_asvs$ASV, taxData[scion_asvs$ASV, "rank"]),
  file = "fasta/BAC_scion_asvs.fasta"
)

Canker counts

Testing the effects of of ASV abundance, α-diversity, and β-diversity on canker counts.

This uses a nested negative binomial regression model.

The base model for canker counts uses the formula: Cankers ~ Site * Storage * Scion.

# Filter out samples with missing canker count
canker_abundance_data <- colData[complete.cases(colData$Cankers), ]

# Base model
canker_design = "Cankers ~ Site * Storage * Scion"
base_model <- glm.nb(canker_design, data = canker_abundance_data)
# Error in eval(predvars, data, env): object 'Site' not found
# Abundance model
abundance_design = paste(canker_design, "+ log(copy_number)")
abundance_model <- glm.nb(abundance_design, data = canker_abundance_data)
# Error in eval(predvars, data, env): object 'Site' not found
# ANOVA of abundance with canker count
kable(anova(base_model, abundance_model))
Model theta Resid. df 2 x log-lik. Test df LR stat. Pr(Chi)
Site * Storage * Scion 2.882191 33 -554.6118 NA NA NA
Site * Storage * Scion + log(copy_number) 2.980708 32 -551.9342 1 vs 2 1 2.677645 0.1017661

Effect of ASV abundance on canker count

# Filter out samples with missing canker count
top_asv_data <- top_asv_data[complete.cases(top_asv_data$Cankers), ]

# Base model design
canker_design = "Cankers ~ Site * Storage * Scion"

# Base model with ASV abundance data
base_model <- glm.nb(canker_design, data = top_asv_data)
# Error in eval(predvars, data, env): object 'Site' not found
# kable(asv_canker_anova(top_asv_ids[1], canker_design, base_model, canker_top_asv_data))

# Effect of ASV abundance on canker count for top ASVs
asv_canker_results <- sapply(
  top_asv_ids, 
  function(x) asv_canker_anova(x, canker_design, base_model, top_asv_data)
) %>% t() %>% data.table()
# Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 't': object 'Site' not found
# Adjust p-values for multiple testing
asv_canker_results$p_adjusted <- p.adjust(asv_canker_results$p, method = "BH")

# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
  nrow(asv_canker_results[p_adjusted < 0.05, ]), 
  "ASVs have statistically significant (*P* < 0.05) adjusted p-values"
)
# 1 ASVs have statistically significant (*P* < 0.05) adjusted p-values
asv_canker_results[p_adjusted < 0.05, ]
#      ASV           Taxonomy Abundance       coef      var            p warning
# 1: ASV40 Mycobacteriales(o)  156.6922 -0.5353163 1.070081 0.0004620826      NA
#    p_adjusted
# 1: 0.03280786

Effect of ASV abundance on canker count per site

# For each site, select ASVs with mean abundance > 100
top_asvs_per_site <- lapply(
  unique(colData$Site),
  function(site) {
    samples <- filter(colData, Site == site)
    top_asv_data <- select(asv_counts, rownames(samples))
    top_asvs <- filter(top_asv_data, rowMeans(top_asv_data) > 100)
    top_asv_ids <- rownames(top_asvs)
    top_asvs <- data.frame(t(top_asvs)) %>% merge(samples, by = 0) %>% column_to_rownames("Row.names")
    top_asvs <- top_asvs[complete.cases(top_asvs$Cankers), ]
    return(list(asvs = top_asv_ids, data = top_asvs))
  }
)

cat(
  "Sites 1, 2, 3 contained",
  paste(sapply(top_asvs_per_site, function(x) length(x$asvs)), collapse = ", "),
  "ASVs respectively with mean read count > 100", "\n\n"
)
# Sites 1, 2, 3 contained  ASVs respectively with mean read count > 100
canker_site_design <- "Cankers ~ Storage * Scion"

# ANOVA of ASV abundance with canker count per ASV
asv_canker_site_anova <- function(asvs, data) {
  base_model <- glm.nb(canker_site_design, data = data)
  results <- sapply(
    asvs, 
    function(asv) asv_canker_anova(asv, canker_site_design, base_model, data)
  ) %>% t() %>% data.table()
  results$p_adjusted <- p.adjust(results$p, method = "BH")
  return(results)
}

# Run ANOVA per site
asv_canker_site_results <- lapply(
  top_asvs_per_site,
  function(x) asv_canker_site_anova(x$asvs, x$data)
)

# Add site to each result as new column and merge into single data.table
asv_canker_site_results <- lapply(
  1:3, 
  function(site) {
    result <- asv_canker_site_results[[site]]
    result$Site <- site
    result
  }
) %>% bind_rows()
# Error in asv_canker_site_results[[site]]: subscript out of bounds
# Significant ASVs
significant_asvs <- asv_canker_site_results[p_adjusted < 0.05 & is.na(warning), ]
# Error in eval(expr, envir, enclos): object 'p_adjusted' not found
significant_asvs
#         ASV               Taxonomy Abundance      coef      var            p
#  1:    ASV2      Kineosporiales(o)  670.9308 0.6280854 6.996585  0.001849117
#  2:   ASV27  Micromonosporaceae(f)  320.4859 0.4042703 2.946708    0.0047199
#  3:    ASV3     Kineosporiaceae(f)   780.185 0.6215193 10.36089  0.001205674
#  4:    ASV4        Streptomyces(g)  400.1423  0.945169 2.159619  2.24083e-05
#  5:    ASV7      Bradyrhizobium(g)  458.6015 0.5898238   3.7398  0.006447539
#  6:   ASV81 Steroidobacteraceae(f)  102.7877 0.4771125 3.930028  0.005419897
#  7: ASV1043        Streptomyces(g)  125.6528 -1.625193 12.86475 0.0003196356
#  8:  ASV124   Micromonosporales(o)  228.8759  -1.12016 7.142329   0.00166798
#  9:  ASV226        Streptomyces(g)  133.4371 -0.890692 11.51696  0.004341775
# 10:   ASV26        Streptomyces(g)   334.252 -1.176118 11.11247  0.006943457
# 11:  ASV497  Micromonosporaceae(f)  117.4129 -2.338883 11.94193 7.702484e-06
# 12: ASV5732        Streptomyces(g)  107.9548 -1.099433 14.36652 0.0007480738
#     warning   p_adjusted Site
#  1:      NA 0.0240385172    2
#  2:      NA 0.0419090058    2
#  3:      NA 0.0235106395    2
#  4:      NA 0.0008739235    2
#  5:      NA 0.0419090058    2
#  6:      NA 0.0419090058    2
#  7:      NA 0.0062328945    3
#  8:      NA 0.0144558263    3
#  9:      NA 0.0282215361    3
# 10:      NA 0.0386849740    3
# 11:      NA 0.0004245090    3
# 12:      NA 0.0093448412    3
# Export significant ASVs as FASTA
write.fasta(
  sequences = BAC_asvs[as.character(significant_asvs$ASV)],
  names = paste(significant_asvs$ASV, taxData[as.character(significant_asvs$ASV), "rank"]),
  file = "fasta/BAC_canker_asvs.fasta"
)
Plot of ASV abundance against canker count
# List of significant ASVs
significant_asv_list <- significant_asvs$ASV %>% unlist()

significant_asv_data <- asv_counts[significant_asv_list, ] %>% 
  t() %>% 
  data.frame() %>% 
  merge(colData, by = 0) %>% 
  column_to_rownames("Row.names") %>%
  select(c(significant_asv_list, "Site", "Storage", "Scion", "Cankers"))
# Error in `select()`:
# ! Can't subset columns that don't exist.
# ✖ Column `Site` doesn't exist.
# Melt data for ggplot
significant_asv_long_data <- significant_asv_data %>% reshape2::melt(
  id.vars = c("Site", "Storage", "Scion", "Cankers"), variable.name = "ASV", value.name = "Abundance"
)

# Log trasnform abundance
significant_asv_long_data$log10_abundance <- log10(significant_asv_long_data$Abundance + 1)

bac_asv_canker_plot <- ggscatter(
  data = significant_asv_long_data, x = "log10_abundance", y = "Cankers", 
  color = "Storage", facet.by = c("ASV", "Site"),
  xlab = "ASV abundance (log10)", ylab = "Canker count",
  palette = cbPalette, legend = "bottom"
)

ggsave(
  filename = "bac_asv_canker_plot.png", plot = bac_asv_canker_plot, path = "figures/",
  height = 40, width = 20, units = "cm"
)

bac_asv_canker_plot

Effect of aggregated genera on canker counts

# Add genus from taxData to countData
bac_genus_data <- counts(dds, normalize = T) %>% as.data.frame() %>% mutate(
  genus = taxData[rownames(countData), "genus"]
)

# Group by genus and set genus to rownames
bac_genus_data <- bac_genus_data %>% group_by(genus) %>% summarise_all(sum) %>% as.data.frame()

# Set rownames as genus
rownames(bac_genus_data) <- bac_genus_data$genus
bac_genus_data <- dplyr::select(bac_genus_data, -genus)

# Filter genera with mean abundance < 100
bac_genus_data <- bac_genus_data[rowMeans(bac_genus_data) > 10, ]

# Rank not genus
not_genus <- rownames(bac_genus_data)[grep("\\([a-z]\\)", rownames(bac_genus_data))]
# Remove rows with genus in not_genus
bac_genus_data <- bac_genus_data[!rownames(bac_genus_data) %in% not_genus, ]
cat(
  length(not_genus), "non-genus ranks removed:\n\n",
  not_genus, "\n"
)
# 0 non-genus ranks removed:
# 
# 
# Remove any with slashes or underscores
removals <- rownames(bac_genus_data)[grep("[/_]", rownames(bac_genus_data))]
# Remove rows with symbols in the name
bac_genus_data <- bac_genus_data[!rownames(bac_genus_data) %in% removals, ]
cat(
  length(removals), "ranks with slashes or underscores removed:\n\n",
  removals, "\n"
)
# 4 ranks with slashes or underscores removed:
# 
#  Armatimonas/Armatimonadetes_gp1 Candidatus_Solibacter Saccharibacteria_genera_incertae_sedis Subdivision3_genera_incertae_sedis
# Set rownames as genus
bac_genera <- rownames(bac_genus_data)

# Transpose and add metadata from colData
bac_genus_data <- t(bac_genus_data) %>% as.data.frame() %>% mutate(
  Site = colData[rownames(.), "Site"],
  Storage = colData[rownames(.), "Storage"],
  Scion = colData[rownames(.), "Scion"],
  Cankers = colData[rownames(.), "Cankers"]
)

# Filter out samples with missing canker count
bac_genus_data <- bac_genus_data[complete.cases(bac_genus_data$Cankers), ]
# Base model design
canker_design = "Cankers ~ Site * Storage * Scion"

# Base model with genus abundance data
base_model <- glm.nb(canker_design, data = bac_genus_data)
# Error in eval(predvars, data, env): object 'Site' not found
# ANOVA of genus abundance with canker count
genus_canker_anova <- function(genus, design, base_model, data) {
  log_genus = paste0("log(", genus, " + 1)")
  f = paste(design, "+", log_genus)#, "+", log_genus, ":Site")
  m = glm.nb(f, data = data)
  a = anova(base_model, m) %>% data.frame()
  b = suppressWarnings(anova(m)) %>% data.frame()
  total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
  d = data.table(
    Genus = genus,
    Abundance = mean(data[[genus]]),
    coef = m$coefficients[log_genus],
    var = b[log_genus, 'Deviance'] / total_deviance * 100,
    p = a[2, 'Pr.Chi.']
  )
  return(d)
}

# genus_canker_anova(bac_genera[1], canker_design, base_model, bac_genus_data)

# Effect of genera abundance on canker counts
genus_canker_results <- sapply(
  bac_genera, 
  function(x) genus_canker_anova(x, canker_design, base_model, bac_genus_data)
) %>% t() %>% data.table()
# Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 't': object 'Site' not found
# Adjust p-values for multiple testing
genus_canker_results$p_adjusted <- p.adjust(genus_canker_results$p, method = "BH")

# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
  nrow(genus_canker_results[p_adjusted < 0.05, ]), "of", nrow(genus_canker_results),
  "genera have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 2 of 238 genera have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(genus_canker_results[p_adjusted < 0.05, ]) > 0) {
  genus_canker_results[p_adjusted < 0.05, ]
}
#               Genus Abundance       coef      var            p   p_adjusted
# 1: Marisediminicola  11.01665 -0.7908458 1.442856 1.264145e-05 1.504332e-03
# 2:    Pedomicrobium  22.65009  0.7736918 1.317756 3.761193e-07 8.951639e-05
sig_genus <- genus_canker_results[p_adjusted < 0.05]$Genus %>% unlist()

for (genus in sig_genus) {
  log_genus = paste0("log(", genus, " + 1)")
  f = paste(canker_design, "+", log_genus)
  m = glm.nb(f, data = bac_genus_data)
  print(anova(base_model, m))
}
# Error in eval(predvars, data, env): object 'Site' not found
Plot of genus abundance against canker count
significant_genera <- genus_canker_results[p_adjusted < 0.05]$Genus %>% unlist()

significant_genera_data <- bac_genus_data[, c(significant_genera, FACTORS, "Cankers")]
# Error in `[.data.frame`(bac_genus_data, , c(significant_genera, FACTORS, : undefined columns selected
# Melt data for ggplot
significant_genera_data <- significant_genera_data %>% reshape2::melt(
  id.vars = c(FACTORS, "Cankers"), variable.name = "Genus", value.name = "Abundance"
)

# Log transform abundance
significant_genera_data$log10_abundance <- log10(significant_genera_data$Abundance + 1)
# Error in significant_genera_data$Abundance + 1: non-numeric argument to binary operator
# Plot of genus abundance against canker count
bac_genus_canker_plot <- ggscatter(
  data = significant_genera_data, x = "log10_abundance", y = "Cankers", 
  color = "Site", shape = "Storage",
  xlab = "Genus abundance (log10)", ylab = "Canker count",
  free_x = T, free_y = T, palette = cbPalette
) + facet_wrap(~ Genus, scales = "free")

ggsave(
  filename = "bac_genus_canker_plot.png", plot = bac_genus_canker_plot, path = "figures/",
  height = 10, width = 20, units = "cm"
)
# Error:
# ! Problem while converting geom to grob.
# ℹ Error occurred in the 1st layer.
# Caused by error in `UseMethod()`:
# ! no applicable method for 'rescale' applied to an object of class "character"

Effect of α-diversity on canker count

# ANOVA of α-diversity with canker count

# Base model with α-diversity data
base_model <- glm.nb(canker_design, data = all_alpha_ord)
# Error in eval(predvars, data, env): object 'Site' not found
measures <- c("S.chao1", "shannon", "simpson")

# ANOVA of α-diversity with canker count
alpha_canker_anova <- function(measure, data) {
  f = paste(canker_design, "+", measure)
  m = glm.nb(f, data = data)
  a = anova(base_model, m) %>% data.frame()
  b = anova(m) %>% data.frame()
  total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
  d = data.frame(
    measure = measure,
    coef = m$coefficients[measure],
    var = b[measure, 'Deviance'] / total_deviance * 100,
    p = a[2, 'Pr.Chi.']
  )
  return(d)
}

# Effect of α-diversity on canker count for each measure
alpha_canker_results <- data.table(t(sapply(measures, function(x) alpha_canker_anova(x, all_alpha_ord))))
# Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 't': object 'Site' not found
alpha_canker_results
#    measure         coef       var          p
# 1: S.chao1 0.0003260492 0.1949192  0.3631458
# 2: shannon     1.047428  3.043469 0.03415997
# 3: simpson     74.20115  2.554056 0.01184907
# ANOVA results
for (measure in measures) {
  f = paste(canker_design, "+", measure)
  m = glm.nb(f, data = all_alpha_ord)
  print(anova(base_model, m))
}
# Error in eval(predvars, data, env): object 'Site' not found

Effect of β-diversity on canker count

no_pcs <- 4

# Merge PC scores with canker data
pc_scores <- merge(colData, data.frame(mypca$x[, 1:no_pcs]), by = "row.names") %>% 
  column_to_rownames("Row.names")

pcs <- tail(colnames(pc_scores), no_pcs)

# Base model with β-diversity data
base_model <- glm.nb(canker_design, data = pc_scores)
# Error in eval(predvars, data, env): object 'Site' not found
# ANOVA of β-diversity with canker count
beta_canker_anova <- function(pc, data) {
  f = paste0(canker_design, "+", pc)
  m = glm.nb(f, data = data)
  a = anova(base_model, m) %>% data.frame()
  b = anova(m) %>% data.frame()
  total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
  d = data.frame(
    PC = pc,
    coef = m$coefficients[pc],
    var = b[pc, 'Deviance'] / total_deviance * 100,
    p = a[2, 'Pr.Chi.']
  )
  return(d)
}

# Effect of β-diversity on canker count for each PC
beta_canker_results <- data.table(t(sapply(pcs, function(x) beta_canker_anova(x, pc_scores))))
# Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 't': object 'Site' not found
beta_canker_results
#     PC         coef        var            p
# 1: PC1  -0.02475561  0.2245606 0.0001345953
# 2: PC2 -0.007178287 0.02763263    0.3213615
# 3: PC3 -0.003666729  0.3145477    0.4859172
# 4: PC4   0.01570856  0.2983212  0.009036866

Extra figures

Abundance

abundance_combined <- rbind(
  as.data.frame(colData(ubiome_FUN$dds)), as.data.frame(colData(ubiome_BAC$dds))
) %>% mutate(kingdom = ifelse(Target == "ITS", "Fungi", "Bacteria")) %>%
  mutate(kingdom = factor(kingdom, levels = c("Fungi", "Bacteria"))) %>%
  mutate(Storage = factor(Storage, levels = c("no", "yes")))
# Error in factor(Storage, levels = c("no", "yes")): object 'Storage' not found
# abundance_bar <- ggbarplot(
#   data = abundance_combined, x = "Storage", y = "log_copy_number", 
#   fill = "Site", add = "mean_se", facet.by = "kingdom",
#   palette = cbPalette, position = position_dodge(0.8),
#   ylab = "Mean copy number (log10)", xlab = F, legend = "right"
# ) + guides(fill = guide_legend(title = "Site"))

# ggsave(
#   filename = "abundance_bar.png", plot = abundance_bar, path = "figures/", 
#   height = 12, width = 24, units = "cm"
# )

# abundance_bar

abundance_box <- ggboxplot(
  data = abundance_combined, x = "Site", y = "log_copy_number", 
  color = "Storage", add = "jitter", facet.by = "kingdom",
  palette = cbPalette, legend = "bottom",
  ylab = "Mean copy number (log10)", xlab = "Site"
) 

ggsave(
  filename = "abundance_box.png", plot = abundance_box, path = "figures/", 
  height = 12, width = 24, units = "cm"
)

abundance_box

Alpha diversity

alpha_combined <- rbind(fun_alpha, bac_alpha) %>% 
  subset(select = c("Site", "Storage", "Scion", "Target", "S.chao1", "shannon", "simpson")) %>%
  mutate(kingdom = ifelse(Target == "ITS", "Fungi", "Bacteria")) %>%
  mutate(kingdom = factor(kingdom, levels = c("Fungi", "Bacteria"))) %>%
  mutate(Storage = factor(Storage, levels = c("no", "yes"))) %>%
  rename(Chao1 = S.chao1, Shannon = shannon, Simpson = simpson) %>%
  pivot_longer(
    cols = c("Chao1", "Shannon", "Simpson"), names_to = "measure", values_to = "value"
  )
# Error in rename(., Chao1 = S.chao1, Shannon = shannon, Simpson = simpson): unused arguments (Chao1 = S.chao1, Shannon = shannon, Simpson = simpson)
# alpha_bar <- ggbarplot(
#   data = alpha_combined, x = "Storage", y = "value", 
#   fill = "Site", add = "mean_se", facet.by = c("measure", "kingdom"), 
#   palette = cbPalette, position = position_dodge(0.8), scales = "free_y",
#   ylab = "Mean diversity index", xlab = F, legend = "right"
# ) + guides(fill = guide_legend(title = "Site"))

# ggsave(
#   filename = "alpha_bar.png", plot = alpha_bar, path = "figures/", 
#   height = 12, width = 24, units = "cm"
# )

# alpha_bar

alpha_box <- ggboxplot(
  data = alpha_combined, x = "Site", y = "value", 
  color = "Storage", add = "jitter", facet.by = c("measure", "kingdom"),
  palette = cbPalette, scales = "free_y", legend = "bottom",
  ylab = "Mean diversity index", xlab = "Site"
) #+ guides(color = guide_legend(position = "right"))
# Error in ggboxplot(data = alpha_combined, x = "Site", y = "value", color = "Storage", : object 'alpha_combined' not found
ggsave(
  filename = "alpha_box.png", plot = alpha_box, path = "figures/", 
  height = 24, width = 24, units = "cm"
)
# Error in plot_theme(plot): object 'alpha_box' not found
alpha_box
# Error in eval(expr, envir, enclos): object 'alpha_box' not found
alpha_box_fungi <- ggboxplot(
  data = alpha_combined[alpha_combined$kingdom == "Fungi", ], x = "Site", y = "value", 
  color = "Storage", add = "jitter", facet.by = "measure",
  palette = cbPalette, scales = "free_y", legend = "bottom",
  ylab = "Mean diversity index", xlab = "Site"
)
# Error in ggboxplot(data = alpha_combined[alpha_combined$kingdom == "Fungi", : object 'alpha_combined' not found
ggsave(
  filename = "alpha_box_fungi.png", plot = alpha_box_fungi, path = "figures/", 
  height = 12, width = 24, units = "cm"
)
# Error in plot_theme(plot): object 'alpha_box_fungi' not found
alpha_box_bacteria <- ggboxplot(
  data = alpha_combined[alpha_combined$kingdom == "Bacteria", ], x = "Site", y = "value", 
  color = "Storage", add = "jitter", facet.by = "measure",
  palette = cbPalette, scales = "free_y", legend = "bottom",
  ylab = "Mean diversity index", xlab = "Site"
)
# Error in ggboxplot(data = alpha_combined[alpha_combined$kingdom == "Bacteria", : object 'alpha_combined' not found
ggsave(
  filename = "alpha_box_bacteria.png", plot = alpha_box_bacteria, path = "figures/", 
  height = 12, width = 24, units = "cm"
)
# Error in plot_theme(plot): object 'alpha_box_bacteria' not found

PCA

pca_combo_plot <- ggarrange(
  fun_pca_plot, bac_pca_plot,
  ncol = 2, nrow = 1, labels = c("A", "B"),
  common.legend = T, legend = "bottom"
)

ggsave(
  filename = "pca_combo_plot.png", plot = pca_combo_plot, path = "figures/", 
  height = 10, width = 24, units = "cm"
)

pca_combo_plot

nmds_combo_plot <- ggarrange(
  fun_nmds_plot, bac_nmds_plot,
  ncol = 2, nrow = 1, widths = c(1.19, 1),
  common.legend = T, legend = "bottom"
)

ggsave(
  filename = "nmds_combo_plot.png", plot = nmds_combo_plot, path = "figures/", 
  height = 13, width = 24, units = "cm"
)

nmds_combo_plot

mega_combo_plot <- ggarrange(
  fun_pca_plot, bac_pca_plot,
  fun_nmds_plot, bac_nmds_plot,
  ncol = 2, nrow = 2, labels = c("A", "B", "C", "D"),
  common.legend = T, legend = "bottom"
) + labs(shape = "Storage")

ggsave(
  filename = "mega_combo_plot.png", plot = mega_combo_plot, path = "figures/", 
  height = 25, width = 30, units = "cm"
)

mega_combo_plot

# Save environment
save.image("BAC.RData")